# Numerical Attribute in Machine Learning Model

## Data Preparation

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the movie table and key it by its 'index' column
# (the CSV's 'Unnamed: 0' column is presumably a saved row counter — confirm).
a = pd.read_csv('data', index_col='Unnamed: 0').set_index('index')
# Binary target: 1 when rating_value >= 7, otherwise 0.
# A missing rating compares False and therefore also gets 0 here.
a['rating_value_binary'] = (a['rating_value'] >= 7).astype(int)

In [None]:
# Load the table holding the value of each movie's feature.
# NOTE(review): the frame is indexed by the CSV's 'Unnamed: 0' column; the later
# joins against `a` assume those labels match a's 'index' values — confirm.
b = pd.read_csv('grade',index_col = 'Unnamed: 0')

In [None]:
# Attach the compound value of each movie's review from the sentiment analysis output.
# The column assignment aligns on index labels, so df_SA's index is assumed to
# carry the same labels as a's index — TODO confirm.
df_SA = pd.read_csv('sentiment_analysis_result', index_col='Unnamed: 0')
a['compound'] = df_SA['compound']
# Normalize the compound value with the linear map x -> x/2 + 1/2
# (sends [-1, 1] onto [0, 1]; presumably that is the score's range — confirm).
a['compound_grade'] = 0.5 * a['compound'] + 0.5

In [None]:
# Add the normalized sentiment score to the feature table as the 'compound' feature.
# The assignment aligns on index labels; rows of b without a matching label in a get NaN.
b['compound'] = a['compound_grade']

In [None]:
# Target frame: the binary label of every movie that has a rating value.
y_data = a.loc[a['rating_value'].notna(), ['rating_value_binary']]
# Same labels as a 2-D NumPy array (one column).
y = np.array(y_data)

In [None]:
# Movies with a rating value: the target column first, followed by the feature
# columns of b (left join on the index, so only rated movies are kept).
x_and_y = y_data.join(b, how='left')
x_and_y

In [None]:
# Movies without a rating value: join their rows with the feature table, then
# discard the placeholder label column so only features remain.
x_not_y = (
    a.loc[a['rating_value'].isnull(), ['rating_value_binary']]
    .join(b)
    .drop(columns=['rating_value_binary'])
)
x_not_y

In [None]:
# Build the train and test samples (70% train / 30% test).
from sklearn.model_selection import train_test_split

# Fix the shuffle seed so the split is the same on every run; the models below
# use hard-coded hyper-parameters, which are only meaningful if the data they
# are fitted and scored on can be reproduced.
train, test = train_test_split(x_and_y, test_size=0.3, random_state=42)

# Select the features by dropping the target by name rather than relying on it
# being the first column of x_and_y.
x_train = train.drop(columns='rating_value_binary')
y_train = train['rating_value_binary']
x_test = test.drop(columns='rating_value_binary')
y_test = test['rating_value_binary']

## Logistic Regression

In [None]:
# Draw the pairwise feature correlations of the unrated movies as a cool-warm colour grid.
import matplotlib.pyplot as plot
feature_corr = x_not_y.corr()
plot.pcolor(feature_corr, cmap='coolwarm')
plot.show()

In [None]:
# Fit a logistic regression with default settings and report its accuracy on the test sample.
from sklearn import linear_model
model = linear_model.LogisticRegression()
model.fit(x_train, y_train)

predictions = model.predict(x_test)
actuals = np.array(y_test)

# score() returns the mean accuracy (a fraction between 0 and 1)
logistic_acc = model.score(x_test, actuals)
print(logistic_acc)

In [None]:
# Prediction for movies without a rating value.
# Start the result frame from the 'compound' column so it carries x_not_y's index.
# Take an explicit copy: adding columns to a bare column selection of x_not_y
# triggers pandas' SettingWithCopyWarning.
prediction = x_not_y[['compound']].copy()
lr_predictions = model.predict(x_not_y)
prediction['lr_predictions'] = lr_predictions
prediction

## Random Forest

In [None]:
# Search for the best combination of random forest hyper-parameters
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
parameters = {
     'n_estimators':(10, 30, 50, 100), #the number of trees
     'max_depth':(4,5,6,8,10,15),
     'min_samples_split': (2, 4, 8),
     'min_samples_leaf': (4,8,12,16)
}

# The `iid` argument was removed from GridSearchCV (scikit-learn 0.24) and passing
# it raises a TypeError; the score is now always the plain average over folds,
# which is what iid=False used to request.
model = GridSearchCV(RandomForestClassifier(),parameters,cv=3)
model.fit(x_train, np.ravel(y_train))
# Best mean cross-validated score and the parameter combination that achieved it
model.best_score_, model.best_params_

In [None]:
# Fit a random forest with one fixed parameter combination and report its test accuracy.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    min_samples_split=4,
    min_samples_leaf=4,
)
rf.fit(x_train, np.ravel(y_train))
# score() returns the mean accuracy (a fraction between 0 and 1)
rf_acc = rf.score(x_test, y_test)
print(rf_acc)

In [None]:
# Prediction for movies without a rating value, using the fitted random forest `rf`.
rf_predictions = rf.predict(x_not_y)
# Stored next to the other models' outputs; rows follow x_not_y's order.
prediction['rf_predictions'] = rf_predictions
prediction

In [None]:
# Draw feature importance image
import matplotlib.pyplot as plt
%matplotlib inline
feature_names = [key for key in x_not_y]
importances = rf.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='y', align='center')
plt.yticks(range(len(indices)),feature_names)
plt.xlabel('Relative Importance')

## Neural Network

In [None]:
# Grid-search the MLP hyper-parameters with 5-fold cross-validation.
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

parameters = {
    'learning_rate': ["constant", "invscaling", "adaptive"],
    'solver': ('sgd', 'lbfgs', 'adam'),
    'activation': ('logistic', 'tanh', 'relu'),
    'hidden_layer_sizes': ((30,), (60,), (80,)),
    'max_iter': (1500,),
}

gs = GridSearchCV(MLPClassifier(), parameters, cv=5)
gs.fit(x_train, y_train)

# Best mean cross-validated score, then the combination that produced it
print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Fit the neural network with one fixed parameter combination and measure its test accuracy.
from sklearn.neural_network import MLPClassifier
# NOTE(review): max_iter is 1000 here while the grid search cell uses 1500 — confirm which is intended.
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(60,), max_iter = 1000, 
                    activation='tanh',
                    learning_rate='adaptive')
clf.fit(x_train,y_train)
predictions = clf.predict(x_test)
actuals = np.array(y_test)

# Confusion counts computed with boolean masks instead of an index loop.
# Class 0 is the one counted as "positive" here: tp = actual 0 and predicted 0.
actual_is_zero = actuals == 0
predicted_is_zero = predictions == 0
tp = int(np.count_nonzero(actual_is_zero & predicted_is_zero))
fn = int(np.count_nonzero(actual_is_zero & ~predicted_is_zero))
tn = int(np.count_nonzero(~actual_is_zero & ~predicted_is_zero))
fp = int(np.count_nonzero(~actual_is_zero & predicted_is_zero))
print(tp,tn,fp,fn)

# Accuracy as a percentage (0-100), unlike the fractions returned by score()
nn_accuracy = ((tp+tn)*100/(tp+tn+fp+fn))
print(nn_accuracy)

In [None]:
# Prediction for movies without a rating value, using the fitted neural network `clf`.
nn_predictions = clf.predict(x_not_y)
prediction['nn_predictions'] = nn_predictions
# Remove the 'compound' feature column so that only the model outputs remain.
prediction = prediction.drop('compound', axis=1)
prediction

## Write the result to a CSV file

In [None]:
# Write the predictions to the file 'prediction1'; the frame's index is written as the first column.
prediction.to_csv('prediction1')

## Use the prediction result to fill the missing values in rating_value

In [None]:
# Read the saved predictions back; no index_col is given, so the saved index comes back as an ordinary column.
prediction_data = pd.read_csv('prediction1')
# Reload the original movie table, indexed by the CSV's 'Unnamed: 0' column.
origin_data = pd.read_csv('data',index_col='Unnamed: 0')

In [None]:
# Turn the neural network's binary prediction into a stand-in rating value:
# predicted class 1 (the label for rating_value >= 7) becomes 7.5, anything else 4.5.
# Only the 'nn_predictions' column is used for this.
# The column is compared directly instead of with a row-wise apply(): the result
# is the same, it avoids a Python call per row, and it also works when
# prediction_data has no rows.
prediction_data['pretend_rating_value'] = np.where(prediction_data['nn_predictions'] == 1, 7.5, 4.5)

In [None]:
# Keep only the merge key ('index') and the stand-in rating value.
update_rating_value = prediction_data[['index','pretend_rating_value']]

In [None]:
# Merge the predicted rating value into the original dataframe, matching rows on
# the 'index' column; the outer join keeps rows that appear on only one side.
update_data = origin_data.merge(update_rating_value, how='outer', on='index')

In [None]:
# Use the predicted rating value to fill the missing entries of the original rating value.
# The result is assigned back to the column: fillna(..., inplace=True) called on
# update_data['rating_value'] modifies an intermediate Series, which pandas warns
# about and which leaves update_data unchanged when Copy-on-Write is enabled.
update_data['rating_value'] = update_data['rating_value'].fillna(update_data['pretend_rating_value'])

In [None]:
# Drop the helper column and write the data to a new .csv file
update_data = update_data.drop(columns='pretend_rating_value')
update_data.to_csv('update_data')