In [287]:
import pandas as pd
from pandas.api.types import is_string_dtype
import numpy as np
from sklearn.linear_model import LinearRegression
import warnings
from sklearn.model_selection import train_test_split
import re
warnings.filterwarnings("ignore")

In [288]:
path_X = 'X.csv'
path_y = 'y.csv'

In [289]:
with open(path_X) as f:
    print(f)

<_io.TextIOWrapper name='X.csv' mode='r' encoding='cp1252'>


In [290]:
df_X = pd.read_csv(path_X)
df_y = pd.read_csv(path_y)

### Dropping irrelative feature 

Do không liên quan đến sách ta drop: link

In [291]:
df_X.drop(['link', 'author_link'], axis=1, inplace=True)

In [292]:
df_X.head(1)

Unnamed: 0.1,Unnamed: 0,id,series,author,rating_count,review_count,number_of_pages,date_published,publisher,original_title,genre_and_votes,isbn,isbn13,settings,characters,description,title,awards
0,0,2767052,(The Hunger Games #1),Suzanne Collins,6459237.0,173214.0,374.0,September 14th 2008,Scholastic Press,The Hunger Games,"Young Adult 30901, Fiction 17363, Science Fict...",439023483,9780440000000.0,"District 12, Panem\n\nCapitol, Panem\n\nPanem\...","Katniss Everdeen, Peeta Mellark, Cato (Hunger ...","Could you survive on your own in the wild, wit...",The Hunger Games,Locus Award Nominee for Best Young Adult Book ...


### Check nan cols

In [293]:
df_X.isna().sum()

Unnamed: 0             0
id                     0
series             12005
author                67
rating_count          67
review_count          67
number_of_pages      719
date_published       508
publisher            836
original_title      4460
genre_and_votes     2198
isbn                4268
isbn13              4496
settings           15391
characters         13902
description          377
title                 61
awards             15912
dtype: int64

In [294]:
df_X.isna().sum()/df_X.shape[0]

Unnamed: 0         0.000000
id                 0.000000
series             0.533556
author             0.002978
rating_count       0.002978
review_count       0.002978
number_of_pages    0.031956
date_published     0.022578
publisher          0.037156
original_title     0.198222
genre_and_votes    0.097689
isbn               0.189689
isbn13             0.199822
settings           0.684044
characters         0.617867
description        0.016756
title              0.002711
awards             0.707200
dtype: float64

Drop cột setting vì NaN nhiều và ko liên quan lắm đến sách

In [295]:
df_X.drop(['settings'], axis=1, inplace=True)

In [296]:
df_X.head(1)

Unnamed: 0.1,Unnamed: 0,id,series,author,rating_count,review_count,number_of_pages,date_published,publisher,original_title,genre_and_votes,isbn,isbn13,characters,description,title,awards
0,0,2767052,(The Hunger Games #1),Suzanne Collins,6459237.0,173214.0,374.0,September 14th 2008,Scholastic Press,The Hunger Games,"Young Adult 30901, Fiction 17363, Science Fict...",439023483,9780440000000.0,"Katniss Everdeen, Peeta Mellark, Cato (Hunger ...","Could you survive on your own in the wild, wit...",The Hunger Games,Locus Award Nominee for Best Young Adult Book ...


#### Reducing data types

In [297]:
df_X.dtypes

Unnamed: 0           int64
id                   int64
series              object
author              object
rating_count       float64
review_count       float64
number_of_pages    float64
date_published      object
publisher           object
original_title      object
genre_and_votes     object
isbn                object
isbn13              object
characters          object
description         object
title               object
awards              object
dtype: object

### Handling series feature

Giá trị ở cột 'sereis' có dạng (abc #x). Ta chỉ giữ lại abc.

In [298]:
def formatTextSeries(text):
  """
  Parameters: str
  Return: formated str
  """
  if text != text: # Check nan-value
    return text
  return text.strip('()').split('#')[0].strip()

In [299]:
def formatSeries(df):
  """
  Parameters: DataFrame
  Return: DataFrame with formated 'series' column
  """
  df_copy = df.copy()
  df_copy['series'] = df_copy['series'].apply(formatTextSeries)
  return df_copy

In [300]:
df_X['series'][4]

'(The Twilight Saga #1)'

In [301]:
df_X = formatSeries(df_X)

In [302]:
df_X['series'][4]

'The Twilight Saga'

### Formating date_published

Do có dòng có ngày đầy đủ có ngày chỉ có năm. Chuyển tất cả về thành năm.

In [303]:
def formatTextDate(text):
    if text != text: # Check nan-value
        return text
    temp = re.findall(r"[\w']+", text)[-1]
    if temp.isnumeric():
        return float(temp)
    return float('nan')

In [304]:
def formatDatePublished(df):
  df_copy = df.copy()
  df_copy['date_published'] = df_copy['date_published'].apply(formatTextDate)
  df_copy['date_published'] = df_copy['date_published'].astype(float)
  return df_copy

In [305]:
df_X['date_published'].head(5)

0    September 14th 2008
1                 Sep-04
2          May 23rd 2006
3      October 10th 2000
4     September 6th 2006
Name: date_published, dtype: object

In [306]:
df_X = formatDatePublished(df_X)

In [307]:
df_X['date_published'].head(5)

0    2008.0
1       4.0
2    2006.0
3    2000.0
4    2006.0
Name: date_published, dtype: float64

Xóa những dòng dữ liệu nhiễu 

In [308]:
df_X = df_X[df_X['date_published'] > 100]

In [309]:
df_X

Unnamed: 0.1,Unnamed: 0,id,series,author,rating_count,review_count,number_of_pages,date_published,publisher,original_title,genre_and_votes,isbn,isbn13,characters,description,title,awards
0,0,2767052,The Hunger Games,Suzanne Collins,6459237.0,173214.0,374.0,2008.0,Scholastic Press,The Hunger Games,"Young Adult 30901, Fiction 17363, Science Fict...",439023483,9.78044E+12,"Katniss Everdeen, Peeta Mellark, Cato (Hunger ...","Could you survive on your own in the wild, wit...",The Hunger Games,Locus Award Nominee for Best Young Adult Book ...
2,2,2657,To Kill a Mockingbird,Harper Lee,4569068.0,92506.0,324.0,2006.0,Harper Perennial Modern Classics,To Kill a Mockingbird,"Classics 45612, Fiction 22734, Historical-Hist...",,,"Scout Finch, Atticus Finch, Jem Finch, Arthur ...",The unforgettable novel of a childhood in a sl...,To Kill a Mockingbird,"Pulitzer Prize for Fiction (1961), Audie Award..."
3,3,1885,,Jane Austen,3047826.0,68657.0,279.0,2000.0,Modern Library,Pride and Prejudice,"Classics 49689, Fiction 15059, Romance 12015, ...",,,"Mr. Bennet, Mrs. Bennet, Jane Bennet, Elizabet...",Alternate cover edition of ISBN 9780679783268S...,Pride and Prejudice,
4,4,41865,The Twilight Saga,Stephenie Meyer,5029979.0,105360.0,501.0,2006.0,"Little, Brown and Company",Twilight,"Young Adult 19666, Fantasy 18533, Romance 1160...",316015849,9.78032E+12,"Edward Cullen, Jacob Black, Laurent, Renee, Be...",About three things I was absolutely positive.F...,Twilight,"Georgia Peach Book Award (2007), Buxtehuder Bu..."
5,5,19063,,Markus Zusak,1863936.0,114137.0,552.0,2006.0,Alfred A. Knopf,The Book Thief,"Historical-Historical Fiction 19377, Fiction 1...",375831002,9.78038E+12,"Liesel Meminger, Hans Hubermann, Rudy Steiner,...",Librarian&aposs note: An alternate cover editi...,The Book Thief,National Jewish Book Award for Children’s and ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22495,22495,10754265,Emaneska,Ben Galley,1598.0,214.0,418.0,2011.0,BenGalley.com,The Written,"Fantasy 235, Fantasy-Epic Fantasy 26, Fantasy-...",956770002,9.78096E+12,,His name is Farden. They whisper that he’s dan...,The Written,
22496,22496,6383762,Oathbreaker,S.R. Vaught,497.0,45.0,420.0,2009.0,Bloomsbury U.S.A. Children's Books,,"Fantasy 29, Young Adult 12, Fiction 4, Fantasy...",1599903768,9.7816E+12,,"In this second half of the Oathbreaker story, ...",A Prince Among Killers,
22497,22497,24308,The Amazing Days of Abby Hayes,Anne Mazer,990.0,29.0,144.0,2001.0,Scholastic Paperbacks,"Have Wheels, Will Travel (The Amazing Days of ...","Childrens 18, Fiction 10, Realistic Fiction 8,...",439178789,9.78044E+12,,Abby Hayes is sick of using her sister¹s batte...,"Have Wheels, Will Travel",
22498,22498,1162710,,Arthur O. Lovejoy,424.0,28.0,400.0,1976.0,Harvard University Press,,"Philosophy 107, History 68, Nonfiction 18, Rel...",674361539,9.78067E+12,,From later antiquity down to the close of the ...,The Great Chain of Being: A Study of the Histo...,


### Handling genre_and_vote

In [310]:
# Find all genre
a = df_X['genre_and_votes'].values.tolist()

In [311]:
c = []
for i in a:
  if i == i:
    c += re.findall(r"[a-zA-Z- ]+", i)

In [312]:
unique = set(c)
len(unique)

1029

Do có nhiều giá trị khác biệt nên không thể tách thành các cột. Mỗi dòng chỉ lấy 1 thể loại tương ứng nhiều bình chọn nhất.

In [313]:
def formatTextGenre(text):
  """
  Parameters: str ('History 19, Fiction 10')
  Return: str (History)
  """
  if text != text:
    return text
  return re.findall(r"[a-zA-Z- ]+", text)[0]

In [314]:
def formatGenre(df):
  """
  Parameters: DataFrame
  Return: DataFrame with formated genre
  """
  df_copy = df.copy()
  df_copy['genre_and_votes'] = df_copy['genre_and_votes'].apply(formatTextGenre)
  df_copy.rename(columns={'genre_and_votes':'genre'}, inplace=True)
  return df_copy

In [315]:
df_X['genre_and_votes'].head(5)

0    Young Adult 30901, Fiction 17363, Science Fict...
2    Classics 45612, Fiction 22734, Historical-Hist...
3    Classics 49689, Fiction 15059, Romance 12015, ...
4    Young Adult 19666, Fantasy 18533, Romance 1160...
5    Historical-Historical Fiction 19377, Fiction 1...
Name: genre_and_votes, dtype: object

In [316]:
df_X = formatGenre(df_X)

In [317]:
df_X['genre'].head(5)

0                      Young Adult 
2                         Classics 
3                         Classics 
4                      Young Adult 
5    Historical-Historical Fiction 
Name: genre, dtype: object

### Handling award

In [318]:
awards = df_X['awards'].values.tolist()

In [319]:
c = []
count_awards = []
for i in awards:
  if i == i:
    arr = i.split(',')
    count_awards.append(len(arr))
    for a in arr:
      c.append(a.split('(')[0].split('for')[0].strip())
  else:
    count_awards.append(0)

In [320]:
len(set(c))

1812

Do có quá nhiều giải khác nhau. Không thể tách thành các cột cho từng giải. Ta xét số lượng giải nhận được. những dòng na sẽ nhận giá trị 0.

In [321]:
def formatTextAward(text):
  if text != text:
    return 0
  return len(text.split(','))

In [322]:
def formatAward(df):
  df_copy = df.copy()
  df_copy['awards'] = df_copy['awards'].apply(formatTextAward)
  return df_copy

In [323]:
df_X['awards'].head(5)

0    Locus Award Nominee for Best Young Adult Book ...
2    Pulitzer Prize for Fiction (1961), Audie Award...
3                                                  NaN
4    Georgia Peach Book Award (2007), Buxtehuder Bu...
5    National Jewish Book Award for Children’s and ...
Name: awards, dtype: object

In [324]:
df_X = formatAward(df_X)

In [325]:
df_X['awards'].head(5)

0    40
2     4
3     0
4    24
5    19
Name: awards, dtype: int64

### Handling description

In [326]:
a = df_X['description'].values.tolist()

In [327]:
b = []
for i in a:
  if i == i:
    b.append(len(i))
  else:
    b.append(0)

In [328]:
len(set(b))

2588

In [329]:
def formatTextDescription(text):
  if text != text:
    return 0
  return len(text)  

In [330]:
def formatDescription(df):
  df_copy = df.copy()
  df_copy['description'] = df_copy['description'].apply(formatTextDescription)
  return df_copy

In [331]:
df_X['description'].head(5)

0    Could you survive on your own in the wild, wit...
2    The unforgettable novel of a childhood in a sl...
3    Alternate cover edition of ISBN 9780679783268S...
4    About three things I was absolutely positive.F...
5    Librarian&aposs note: An alternate cover editi...
Name: description, dtype: object

In [332]:
df_X = formatDescription(df_X)

In [333]:
df_X['description'].head(3)

0    1298
2    1202
3    1032
Name: description, dtype: int64

In [334]:
df_X.head(3)

Unnamed: 0.1,Unnamed: 0,id,series,author,rating_count,review_count,number_of_pages,date_published,publisher,original_title,genre,isbn,isbn13,characters,description,title,awards
0,0,2767052,The Hunger Games,Suzanne Collins,6459237.0,173214.0,374.0,2008.0,Scholastic Press,The Hunger Games,Young Adult,439023483.0,9780440000000.0,"Katniss Everdeen, Peeta Mellark, Cato (Hunger ...",1298,The Hunger Games,40
2,2,2657,To Kill a Mockingbird,Harper Lee,4569068.0,92506.0,324.0,2006.0,Harper Perennial Modern Classics,To Kill a Mockingbird,Classics,,,"Scout Finch, Atticus Finch, Jem Finch, Arthur ...",1202,To Kill a Mockingbird,4
3,3,1885,,Jane Austen,3047826.0,68657.0,279.0,2000.0,Modern Library,Pride and Prejudice,Classics,,,"Mr. Bennet, Mrs. Bennet, Jane Bennet, Elizabet...",1032,Pride and Prejudice,0


### Check unique values

In [335]:
df_X.nunique()

Unnamed: 0         21128
id                 21128
series              5078
author             10394
rating_count       14606
review_count        5363
number_of_pages     1174
date_published        95
publisher           5174
original_title     16671
genre                235
isbn               17447
isbn13               384
characters          7588
description         2588
title              20370
awards                26
dtype: int64

In [336]:
df_X.nunique()/df_X.shape[0]

Unnamed: 0         1.000000
id                 1.000000
series             0.240345
author             0.491954
rating_count       0.691310
review_count       0.253834
number_of_pages    0.055566
date_published     0.004496
publisher          0.244888
original_title     0.789048
genre              0.011123
isbn               0.825776
isbn13             0.018175
characters         0.359144
description        0.122491
title              0.964123
awards             0.001231
dtype: float64

Do title, original_title, isbn, isbn13 quá nhiều giá trị khác biệt. Vì vậy có ít sự ảnh hưởng đến mô hình.

### Droping title, original_title, isbn, isbn13

In [337]:
df_X.drop(['title','original_title', 'isbn', 'isbn13'], axis=1, inplace=True)

In [338]:
df_X.head(1)

Unnamed: 0.1,Unnamed: 0,id,series,author,rating_count,review_count,number_of_pages,date_published,publisher,genre,characters,description,awards
0,0,2767052,The Hunger Games,Suzanne Collins,6459237.0,173214.0,374.0,2008.0,Scholastic Press,Young Adult,"Katniss Everdeen, Peeta Mellark, Cato (Hunger ...",1298,40


In [339]:
df_X.nunique()

Unnamed: 0         21128
id                 21128
series              5078
author             10394
rating_count       14606
review_count        5363
number_of_pages     1174
date_published        95
publisher           5174
genre                235
characters          7588
description         2588
awards                26
dtype: int64

### Handling author

In [340]:
df_X['author'].head(10)

0       Suzanne Collins
2            Harper Lee
3           Jane Austen
4       Stephenie Meyer
5          Markus Zusak
7            C.S. Lewis
8        J.R.R. Tolkien
9            John Green
10    Margaret Mitchell
11        Douglas Adams
Name: author, dtype: object

In [341]:
authors = []
authors_raw = df_X['author'].values.tolist()
for i in authors_raw:
  if i == i:
    arr = i.split(',')
    for a in arr:
      authors.append(a.strip())

In [342]:
unique_authors = set(authors)
len(unique_authors)

10397

Do quá nhiều giá trị khác biệt. Vì vậy xóa cột 'authors'.

In [343]:
df_X.drop('author', axis=1, inplace=True)

In [344]:
df_X.head(1)

Unnamed: 0.1,Unnamed: 0,id,series,rating_count,review_count,number_of_pages,date_published,publisher,genre,characters,description,awards
0,0,2767052,The Hunger Games,6459237.0,173214.0,374.0,2008.0,Scholastic Press,Young Adult,"Katniss Everdeen, Peeta Mellark, Cato (Hunger ...",1298,40


In [345]:
df_X.nunique()

Unnamed: 0         21128
id                 21128
series              5078
rating_count       14606
review_count        5363
number_of_pages     1174
date_published        95
publisher           5174
genre                235
characters          7588
description         2588
awards                26
dtype: int64

In [346]:
df_X.dtypes

Unnamed: 0           int64
id                   int64
series              object
rating_count       float64
review_count       float64
number_of_pages    float64
date_published     float64
publisher           object
genre               object
characters          object
description          int64
awards               int64
dtype: object

In [347]:
df_X.drop('characters', axis=1, inplace=True)

### Handle non-numeric title

Do các cột không là số (series, publisher, genre) đều không là các giá trị binary hoặc có thể xác định trước nên không thể tách thành các cột. Mình sẽ xử lý bằng cách gán số cho nó . Mỗi unique sẽ được gán 1 con số riêng khác nhau

In [348]:
import pickle

In [349]:
def save_map(map, map_name):
  with open(map_name + '.pkl', 'wb') as f:
    pickle.dump(map, f, pickle.HIGHEST_PROTOCOL)

def load_map(map_name):
  with open(map_name + '.pkl', 'rb') as f:
    return pickle.load(f)

In [350]:
def createMappingTable(df, column, text_digit_vals):
  df_copy = df.copy()
  text_digit_vals_column = {}
  def convertToInt(text):
    if text != text or text == '' or text == 0:
      return 0
    return int(text_digit_vals_column[text])
  column_contents = df_copy[column].values.tolist()
  unique_elements = set(column_contents)
  x = 1
  for unique in unique_elements:
    if unique not in text_digit_vals_column and unique == unique and unique != 0:
      text_digit_vals_column[unique] = x
      x += 1
  text_digit_vals[column] = text_digit_vals_column
  df_copy[column] = list(map(convertToInt, df_copy[column]))
  return df_copy

In [351]:
def mapData(df, column, text_digit_vals):
  df_copy = df.copy()
  text_digit_vals_column = text_digit_vals[column]
  def convertToInt(text):
    if text != text or text == '' or text == 0:
      return 0
    return int(text_digit_vals_column[text])
  column_contents = df_copy[column].values.tolist()
  unique_elements = set(column_contents)
  #x = len(text_digit_vals_column)
  for unique in unique_elements:
    if unique not in text_digit_vals_column and unique == unique and unique != 0:  
      text_digit_vals_column[unique] = 0
      #x += 1
  df_copy[column] = list(map(convertToInt, df_copy[column]))
  return df_copy

In [352]:
df_X.head(3)

Unnamed: 0.1,Unnamed: 0,id,series,rating_count,review_count,number_of_pages,date_published,publisher,genre,description,awards
0,0,2767052,The Hunger Games,6459237.0,173214.0,374.0,2008.0,Scholastic Press,Young Adult,1298,40
2,2,2657,To Kill a Mockingbird,4569068.0,92506.0,324.0,2006.0,Harper Perennial Modern Classics,Classics,1202,4
3,3,1885,,3047826.0,68657.0,279.0,2000.0,Modern Library,Classics,1032,0


In [353]:
text_digit_vals = {}
df_X = createMappingTable(df_X,'series', text_digit_vals)
df_X = createMappingTable(df_X,'publisher', text_digit_vals)
df_X = createMappingTable(df_X,'genre', text_digit_vals)

In [354]:
save_map(text_digit_vals,'map')

In [355]:
df_X.head()

Unnamed: 0.1,Unnamed: 0,id,series,rating_count,review_count,number_of_pages,date_published,publisher,genre,description,awards
0,0,2767052,3352,6459237.0,173214.0,374.0,2008.0,993,44,1298,40
2,2,2657,1771,4569068.0,92506.0,324.0,2006.0,3701,120,1202,4
3,3,1885,0,3047826.0,68657.0,279.0,2000.0,937,120,1032,0
4,4,41865,1181,5029979.0,105360.0,501.0,2006.0,5165,44,679,24
5,5,19063,0,1863936.0,114137.0,552.0,2006.0,3789,96,1383,19


In [356]:
df_X.drop('Unnamed: 0', axis=1, inplace=True)

In [357]:
df_X

Unnamed: 0,id,series,rating_count,review_count,number_of_pages,date_published,publisher,genre,description,awards
0,2767052,3352,6459237.0,173214.0,374.0,2008.0,993,44,1298,40
2,2657,1771,4569068.0,92506.0,324.0,2006.0,3701,120,1202,4
3,1885,0,3047826.0,68657.0,279.0,2000.0,937,120,1032,0
4,41865,1181,5029979.0,105360.0,501.0,2006.0,5165,44,679,24
5,19063,0,1863936.0,114137.0,552.0,2006.0,3789,96,1383,19
...,...,...,...,...,...,...,...,...,...,...
22495,10754265,795,1598.0,214.0,418.0,2011.0,2593,163,1249,0
22496,6383762,3471,497.0,45.0,420.0,2009.0,771,163,949,0
22497,24308,3719,990.0,29.0,144.0,2001.0,3937,26,297,0
22498,1162710,0,424.0,28.0,400.0,1976.0,1698,134,1247,0


### Join X and y

In [358]:
df_X['id'] = df_X['id'].astype(int)

In [359]:
df_y['id'] = df_y['id'].astype(int)

In [360]:
df = df_X.set_index('id').join(df_y.set_index('id'))
df.reset_index(inplace=True)

In [361]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [362]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21128 entries, 0 to 21127
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               21128 non-null  int64  
 1   series           21128 non-null  int64  
 2   rating_count     21128 non-null  float64
 3   review_count     21128 non-null  float64
 4   number_of_pages  20701 non-null  float64
 5   date_published   21128 non-null  float64
 6   publisher        21128 non-null  int64  
 7   genre            21128 non-null  int64  
 8   description      21128 non-null  int64  
 9   awards           21128 non-null  int64  
 10  rating           21128 non-null  float64
dtypes: float64(5), int64(6)
memory usage: 1.8 MB


### Drop duplicate rows

In [363]:
df = df.drop_duplicates()
df.reset_index(inplace=True, drop=True)

In [364]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21128 entries, 0 to 21127
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               21128 non-null  int64  
 1   series           21128 non-null  int64  
 2   rating_count     21128 non-null  float64
 3   review_count     21128 non-null  float64
 4   number_of_pages  20701 non-null  float64
 5   date_published   21128 non-null  float64
 6   publisher        21128 non-null  int64  
 7   genre            21128 non-null  int64  
 8   description      21128 non-null  int64  
 9   awards           21128 non-null  int64  
 10  rating           21128 non-null  float64
dtypes: float64(5), int64(6)
memory usage: 1.8 MB


### Remove rows noise data

In [365]:
df = df[df['rating_count'] >= 1]

In [366]:
df = df[df['number_of_pages'] >= 1]

### Drop over nan values rows

In [367]:
df.isna().sum()

id                 0
series             0
rating_count       0
review_count       0
number_of_pages    0
date_published     0
publisher          0
genre              0
description        0
awards             0
rating             0
dtype: int64

In [368]:
per = 0.5 # Phan tram gia tri khac na
df_dropped = df.dropna(axis=1,thresh=int(df.shape[0]*per)) # Drop features with over 50% nan values
df_dropped_2 = df.dropna(axis=0,thresh=int(df.shape[1]*per)) 
#df_dropped_2 = df_dropped.dropna(how='any') # Drop 
df_dropped_2.reset_index(inplace=True, drop=True)

In [369]:
df_dropped_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20642 entries, 0 to 20641
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               20642 non-null  int64  
 1   series           20642 non-null  int64  
 2   rating_count     20642 non-null  float64
 3   review_count     20642 non-null  float64
 4   number_of_pages  20642 non-null  float64
 5   date_published   20642 non-null  float64
 6   publisher        20642 non-null  int64  
 7   genre            20642 non-null  int64  
 8   description      20642 non-null  int64  
 9   awards           20642 non-null  int64  
 10  rating           20642 non-null  float64
dtypes: float64(5), int64(6)
memory usage: 1.7 MB


### Data set preprocessing

In [370]:
def preprocess(df):
  df_copy = df.copy()
  #df_copy = correctDataEncoding(df_copy)
  list_col_dropped = ['link', 'author_link',  
                      'settings', 'characters', 'author', 
                      'original_title', 'isbn', 'isbn13', 'id']
  df_copy.drop(list_col_dropped, axis=1, inplace=True)
  df_copy = formatSeries(df_copy)
  df_copy = formatDatePublished(df_copy)
  df_copy = formatAward(df_copy)
  df_copy = formatGenre(df_copy)
  df_copy = formatDescription(df_copy)
  text_digit_vals = load_map('map')
  df_copy = mapData(df_copy, 'series', text_digit_vals)

  df_copy = mapData(df_copy, 'publisher', text_digit_vals)

  df_copy = mapData(df_copy, 'genre', text_digit_vals)
  #df_copy = fillNaN(df_copy)
  return df_copy

### Processing dummies

In [371]:
data_dummies = pd.get_dummies(df_dropped_2, drop_first=True)
data_dummies = data_dummies.astype(float)
cols = data_dummies.columns.values
data_preprocessed = data_dummies[cols]

In [372]:
data_preprocessed.head(5)

Unnamed: 0,id,series,rating_count,review_count,number_of_pages,date_published,publisher,genre,description,awards,rating
0,2767052.0,3352.0,6459237.0,173214.0,374.0,2008.0,993.0,44.0,1298.0,40.0,4.32
1,2657.0,1771.0,4569068.0,92506.0,324.0,2006.0,3701.0,120.0,1202.0,4.0,4.28
2,1885.0,0.0,3047826.0,68657.0,279.0,2000.0,937.0,120.0,1032.0,0.0,4.26
3,41865.0,1181.0,5029979.0,105360.0,501.0,2006.0,5165.0,44.0,679.0,24.0,3.6
4,19063.0,0.0,1863936.0,114137.0,552.0,2006.0,3789.0,96.0,1383.0,19.0,4.37


In [373]:
data_preprocessed.shape

(20642, 11)

In [374]:
X = data_preprocessed.drop(['id', 'rating'], axis=1)
y = data_preprocessed['rating'].values

### Split train and valid data

In [375]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y,test_size=0.2,random_state=365)

### Training

In [376]:
def save_model(model,model_name):
  with open(model_name + '.pkl', 'wb') as f:
    pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

def load_model(model_name):
  with open(model_name + '.pkl', 'rb') as f:
    return pickle.load(f)

In [377]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

In [378]:
# Linear Regression
reg_LR = make_pipeline(StandardScaler(), LinearRegression())
# Support Vector Regression
reg_SVR = make_pipeline(StandardScaler(), SVR(C=0.5, gamma=0.1))
# MLP Regression
reg_MLPR = make_pipeline(StandardScaler(), MLPRegressor(random_state=1, max_iter=500))

In [379]:
reg_LR.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])

In [380]:
reg_SVR.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(C=0.5, gamma=0.1))])

In [381]:
reg_MLPR.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpregressor', MLPRegressor(max_iter=500, random_state=1))])

In [382]:
save_model(reg_LR, 'LR')
save_model(reg_SVR, 'SVR')
save_model(reg_MLPR, 'MLPR')

### Evaluating

Ở đây nhóm em sử dụng 3 độ đo : MAE , MSE , R2

In [383]:
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2

In [384]:
def scoreModel(y_true, y_pre):
  """
  Parameters: y_true: , y_pre: predicted values
  Returns: MAE, MSE, R2 metric 
  """
  MAE_score = MAE(y_true, y_pre)
  MSE_score = MSE(y_true, y_pre)
  R2_score = R2(y_true, y_pre)
  print('MAE: ', MAE_score, '\n' +
      'MSE: ', MSE_score, '\n' +
      'R2: ', R2_score, '\n')
  return MAE_score, MSE_score, R2_score

#### Train set

In [385]:
def getResult(X, y):
  y_pre_LR = reg_LR.predict(X)
  y_pre_SVR = reg_SVR.predict(X)
  y_pre_MLPR = reg_MLPR.predict(X)
  LR = scoreModel(y, y_pre_LR)
  SVR = scoreModel(y, y_pre_SVR)
  MLPR = scoreModel(y, y_pre_MLPR)
  result_table = pd.DataFrame([['Linear Regression'] + list(LR),
                ['Support Vector Regression (C=0.5, gamma=0.1)'] + list(SVR),
                ['Multi-Layer Perceptron Regressor (Neural Network)'] + list(MLPR)], columns=['Model', 'MAE', 'MSE', 'R2'])
  return result_table
      

In [386]:
getResult(X_train, y_train)

MAE:  0.22822532677561458 
MSE:  0.0911351949222365 
R2:  0.044031617439876936 

MAE:  0.2073206159266475 
MSE:  0.0776268943976651 
R2:  0.18572778887649244 

MAE:  0.21383008080795032 
MSE:  0.10200774366027858 
R2:  -0.07001666917739513 



Unnamed: 0,Model,MAE,MSE,R2
0,Linear Regression,0.228225,0.091135,0.044032
1,"Support Vector Regression (C=0.5, gamma=0.1)",0.207321,0.077627,0.185728
2,Multi-Layer Perceptron Regressor (Neural Network),0.21383,0.102008,-0.070017


#### Test set

In [387]:
getResult(X_valid, y_valid)

MAE:  0.23015202643280055 
MSE:  0.09129112070924991 
R2:  0.05240357539841811 

MAE:  0.21372007430676312 
MSE:  0.08003278599693658 
R2:  0.16926442273463704 

MAE:  0.2197913291737955 
MSE:  0.1272046263475972 
R2:  -0.32037648550358866 



Unnamed: 0,Model,MAE,MSE,R2
0,Linear Regression,0.230152,0.091291,0.052404
1,"Support Vector Regression (C=0.5, gamma=0.1)",0.21372,0.080033,0.169264
2,Multi-Layer Perceptron Regressor (Neural Network),0.219791,0.127205,-0.320376


**Nhận xét** : Có thể thấy trong 3 độ đo thì Support Vector Regression (C=0.5, gamma=0.1) cho ra kết quả tốt nhất. Nên mình sẽ dùng model này để chạy và in ra file dự đoán cho kết quả bài toán

In [388]:
predict= reg_SVR.predict(X_valid)

In [389]:
frame = pd.DataFrame( {'Predict' : predict,
                       'Rating' : y_valid
                      }
                    )

In [390]:
frame.to_csv('predict.csv')