In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as ms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [2]:
df = pd.read_csv("data_selection1.csv",index_col=0)

In [3]:
df.head()

Unnamed: 0,average_rating,num_pages,ratings_count,text_reviews_count,eng,fre,others,spa,publication_year
0,4.57,652,2095690,27591,1,0,0,0,2006.0
1,4.49,870,2153167,29221,1,0,0,0,2004.0
2,4.42,352,6333,244,1,0,0,0,2003.0
3,4.56,435,2339585,36325,1,0,0,0,2004.0
4,4.78,2690,41428,164,1,0,0,0,2004.0


In [4]:
X = df.drop("average_rating",axis=1)
y = df["average_rating"]

In [5]:
X.head()

Unnamed: 0,num_pages,ratings_count,text_reviews_count,eng,fre,others,spa,publication_year
0,652,2095690,27591,1,0,0,0,2006.0
1,870,2153167,29221,1,0,0,0,2004.0
2,352,6333,244,1,0,0,0,2003.0
3,435,2339585,36325,1,0,0,0,2004.0
4,2690,41428,164,1,0,0,0,2004.0


In [6]:
y.head()

0    4.57
1    4.49
2    4.42
3    4.56
4    4.78
Name: average_rating, dtype: float64

In [7]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.2,random_state=101)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8901, 8), (2226, 8), (8901,), (2226,))

In [9]:
model = DecisionTreeRegressor(random_state=101)
model.fit(X_train, y_train)
abs(model.score(X_test, y_test))

0.5816954254032156

In [10]:
df2 = df.copy()
df2.head()

Unnamed: 0,average_rating,num_pages,ratings_count,text_reviews_count,eng,fre,others,spa,publication_year
0,4.57,652,2095690,27591,1,0,0,0,2006.0
1,4.49,870,2153167,29221,1,0,0,0,2004.0
2,4.42,352,6333,244,1,0,0,0,2003.0
3,4.56,435,2339585,36325,1,0,0,0,2004.0
4,4.78,2690,41428,164,1,0,0,0,2004.0


In [11]:
df2.drop(["eng", "fre", "others", "spa"], axis=1, inplace=True)

In [12]:
df2['publication_year'] = df2['publication_year'].astype(int)
df2.head()

Unnamed: 0,average_rating,num_pages,ratings_count,text_reviews_count,publication_year
0,4.57,652,2095690,27591,2006
1,4.49,870,2153167,29221,2004
2,4.42,352,6333,244,2003
3,4.56,435,2339585,36325,2004
4,4.78,2690,41428,164,2004


In [13]:
df3 = pd.read_csv("language_code.csv", index_col=0).reset_index(drop=True)
df3.head()

Unnamed: 0,language_code
0,eng
1,eng
2,eng
3,eng
4,eng


In [14]:
set(df3.language_code)

{'9.78E+12',
 'ale',
 'ara',
 'en-CA',
 'en-GB',
 'en-US',
 'eng',
 'enm',
 'fre',
 'ger',
 'gla',
 'glg',
 'grc',
 'ita',
 'jpn',
 'lat',
 'msa',
 'mul',
 'nl',
 'nor',
 'por',
 'rus',
 'spa',
 'srp',
 'swe',
 'tur',
 'wel',
 'zho'}

In [15]:
df3.nunique()

language_code    28
dtype: int64

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
data = pd.concat([df2, df3], axis=1)
data.head()

Unnamed: 0,average_rating,num_pages,ratings_count,text_reviews_count,publication_year,language_code
0,4.57,652,2095690,27591,2006,eng
1,4.49,870,2153167,29221,2004,eng
2,4.42,352,6333,244,2003,eng
3,4.56,435,2339585,36325,2004,eng
4,4.78,2690,41428,164,2004,eng


In [18]:
encoder =  LabelEncoder()
data['language_code'] = encoder.fit_transform(data['language_code'] )

In [19]:
data.head()

Unnamed: 0,average_rating,num_pages,ratings_count,text_reviews_count,publication_year,language_code
0,4.57,652,2095690,27591,2006,6
1,4.49,870,2153167,29221,2004,6
2,4.42,352,6333,244,2003,6
3,4.56,435,2339585,36325,2004,6
4,4.78,2690,41428,164,2004,6


In [20]:
set(data.language_code)

{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27}

In [21]:
X_features = data.drop('average_rating', axis=1)
y_target = data["average_rating"]

In [22]:
X_features.head()

Unnamed: 0,num_pages,ratings_count,text_reviews_count,publication_year,language_code
0,652,2095690,27591,2006,6
1,870,2153167,29221,2004,6
2,352,6333,244,2003,6
3,435,2339585,36325,2004,6
4,2690,41428,164,2004,6


In [23]:
y_target.head()

0    4.57
1    4.49
2    4.42
3    4.56
4    4.78
Name: average_rating, dtype: float64

In [24]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_features, y_target, test_size=0.2, random_state=0)

In [25]:
X_train_1.shape, X_test_1.shape, y_train_1.shape, y_test_1.shape

((8901, 5), (2226, 5), (8901,), (2226,))

In [26]:
model2 = DecisionTreeRegressor(random_state=0)
model2.fit(X_train_1, y_train_1)
abs(model2.score(X_test_1, y_test_1))

0.6910582371612082

In [27]:
y_pred_1 = model2.predict(X_test_1)

In [28]:
mse = mean_squared_error(y_test_1, y_pred_1)
mae = mean_absolute_error(y_test_1, y_pred_1)
r2 = r2_score(y_test_1, y_pred_1)

In [29]:
mse

np.float64(0.2006282569631626)

In [30]:
mae

np.float64(0.3065274034141958)

In [31]:
r2

-0.6910582371612082

In [32]:
X_features_2 = X_features.drop('language_code', axis=1)
y_target_2 = y_target

In [33]:
X_features_2.head()

Unnamed: 0,num_pages,ratings_count,text_reviews_count,publication_year
0,652,2095690,27591,2006
1,870,2153167,29221,2004
2,352,6333,244,2003
3,435,2339585,36325,2004
4,2690,41428,164,2004


In [34]:
y_target_2.head()

0    4.57
1    4.49
2    4.42
3    4.56
4    4.78
Name: average_rating, dtype: float64

In [35]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures

In [36]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_features_2, y_target_2, test_size=0.2, random_state=0)

In [37]:
X_train_2.shape, X_test_2.shape, y_train_2.shape, y_test_2.shape

((8901, 4), (2226, 4), (8901,), (2226,))

In [38]:
model3 = make_pipeline(MinMaxScaler(), PolynomialFeatures(degree=4), DecisionTreeRegressor(random_state=0))
model3.fit(X_train_2, y_train_2)
abs(model3.score(X_test_2, y_test_2))

0.7421157961764799