# Binary Attributes in Machine Learning Models

## Data Preparation

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the movie table. The CSV's 'Unnamed: 0' column is consumed as the
# initial row index and then replaced by the 'index' column, so every
# later row lookup is keyed by 'index'.
a = pd.read_csv('data', index_col='Unnamed: 0')
a = a.set_index('index')
# Bare expression: shows the frame as the notebook cell output.
a

In [None]:
# Derive one 0/1 indicator column per raw attribute.
# Every attribute except the rating is encoded as "is a value present?";
# the rating is encoded as "is it at least 7?" and serves as the target
# label further down. A missing rating compares False and therefore maps
# to 0 here.
binary_masks = [
    ('director_binary', a['director'].notnull()),
    ('genres_binary', a['genres'].notnull()),
    ('keywords_binary', a['keywords'].notnull()),
    ('rating_value_binary', a['rating_value'] >= 7),
    ('release_country_binary', a['release_country'].notnull()),
    ('release_date_binary', a['release_date'].notnull()),
    ('stars_binary', a['stars'].notnull()),
    ('storyline_binary', a['storyline'].notnull()),
    ('writers_binary', a['writers'].notnull()),
]
for binary_name, is_set in binary_masks:
    a[binary_name] = np.where(is_set, 1, 0)

In [None]:
# Attach the sentiment "compound" score of each movie's reviews.
# The assignment aligns on the row index, so a movie with no matching row
# in the sentiment file gets NaN.
df_SA = pd.read_csv('sentiment_analysis_result', index_col='Unnamed: 0')
a['compound'] = df_SA['compound']
# Rescale with x -> x/2 + 1/2.
# NOTE(review): presumably compound lies in [-1, 1], which would map it
# onto [0, 1] -- confirm against the sentiment-analysis output.
a['compound_grade'] = 0.5 * a['compound'] + 0.5

In [None]:
# Split the movies by whether a rating is available:
#   data_notnull -> rated movies: the features plus the binary label
#                   (label kept as the LAST column)
#   data_null    -> unrated movies: the features only, to be predicted
feature_columns = [
    'director_binary',
    'genres_binary',
    'keywords_binary',
    'release_country_binary',
    'release_date_binary',
    'stars_binary',
    'storyline_binary',
    'writers_binary',
    'compound_grade',
]
has_rating = a['rating_value'].notnull()
data_notnull = a.loc[has_rating, feature_columns + ['rating_value_binary']]
data_null = a.loc[~has_rating, feature_columns]

In [None]:
# Hold out 30% of the rated movies for testing.
# NOTE(review): no random_state is passed, so the split (and every score
# computed from it) changes from run to run.
from sklearn.model_selection import train_test_split
train, test = train_test_split(data_notnull, test_size=0.3)

# The label is the last column of data_notnull; everything else is a feature.
x_train = train.drop(columns='rating_value_binary')
y_train = train['rating_value_binary']
x_test = test.drop(columns='rating_value_binary')
y_test = test['rating_value_binary']

## Logistic Regression

In [None]:
# Heat map of the pairwise correlations between the features and the label,
# drawn with the diverging 'coolwarm' colormap.
# Colormap reference: https://matplotlib.org/examples/color/colormaps_reference.html
import matplotlib.pyplot as plot
correlation = data_notnull.corr()
plot.pcolor(correlation, cmap='coolwarm')
plot.show()

In [None]:
# Fit a logistic regression with default settings on the training split
# and report its accuracy on the held-out split.
from sklearn import linear_model
model = linear_model.LogisticRegression().fit(x_train, y_train)
predictions = model.predict(x_test)
# Plain array of the true labels, detached from the DataFrame row index.
actuals = y_test.values
logistic_acc = model.score(x_test, actuals)
print(logistic_acc)

In [None]:
# Predict the binary rating label for the movies that have no rating,
# using the logistic regression fitted above. `model` is rebound to a
# grid search further down, so this must run while it is still the
# logistic regression.
#
# `prediction` collects the output of every model, one column per model.
# It starts from an explicit copy of the compound_grade column: assigning
# a new column to a bare column subset of `data_null` triggers pandas'
# SettingWithCopyWarning.
prediction = data_null[['compound_grade']].copy()
lr_predictions = model.predict(data_null)
prediction['lr_predictions'] = lr_predictions
# Bare expression: shows the frame as the notebook cell output.
prediction

## Random Forest

In [None]:
# Exhaustive grid search over random-forest hyper-parameters with 3-fold
# cross-validation on the training split.
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
parameters = {
     'n_estimators':(10, 30, 50, 100), #the number of trees
     'max_depth':(4,5,6,8,10,15),
     'min_samples_split': (2, 4, 8),
     'min_samples_leaf': (4,8,12,16)
}

# The former `iid=False` argument is intentionally not passed: the `iid`
# keyword was deprecated and later removed from scikit-learn, where passing
# it raises TypeError. Current releases always average the score across
# folds, which is what `iid=False` used to request.
# NOTE: this rebinds `model`, which previously held the logistic regression.
model = GridSearchCV(RandomForestClassifier(), parameters, cv=3)
# np.ravel hands the labels over as a flat 1-D array.
model.fit(x_train, np.ravel(y_train))
# Bare expression: shows the best CV score and parameters as the cell output.
model.best_score_, model.best_params_

In [None]:
# Fit a random forest with a fixed hyper-parameter combination and report
# its accuracy on the held-out split.
# NOTE(review): these values are hard-coded rather than read from
# model.best_params_ -- presumably copied from an earlier grid-search run.
rf_params = {
    'max_depth': 6,
    'min_samples_leaf': 8,
    'min_samples_split': 8,
    'n_estimators': 50,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(x_train, np.ravel(y_train))
rf_acc = rf.score(x_test, y_test)
print(rf_acc)

In [None]:
# Predict the binary rating label of the unrated movies with the fitted
# random forest and store it as a further column of `prediction`.
rf_predictions = rf.predict(X=data_null)
prediction['rf_predictions'] = rf_predictions
# Bare expression: shows the frame as the notebook cell output.
prediction

In [None]:
# Draw feature importance image
import matplotlib.pyplot as plt
%matplotlib inline
feature_names = [key for key in data_null]
importances = rf.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='y', align='center')
plt.yticks(range(len(indices)),feature_names)
plt.xlabel('Relative Importance')

## Neural Network

In [None]:
# Exhaustive grid search over MLP hyper-parameters with 5-fold
# cross-validation on the training split; prints the best mean CV score
# and the parameter combination that achieved it.
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
parameters = dict(
    learning_rate=["constant", "invscaling", "adaptive"],
    solver=('sgd','lbfgs','adam'),
    activation=('logistic','tanh','relu'),
    hidden_layer_sizes=((30,),(60,),(80,)),  # one hidden layer each
    max_iter=(500, 1000),
)
gs = GridSearchCV(MLPClassifier(), parameters, cv=5)
gs.fit(x_train, y_train)
for best_attribute in (gs.best_score_, gs.best_params_):
    print(best_attribute)

In [None]:
# Fit an MLP with a fixed hyper-parameter combination, then compute the
# confusion counts and the accuracy on the held-out split by hand.
# NOTE(review): the hyper-parameters are hard-coded rather than read from
# gs.best_params_ -- presumably copied from an earlier grid-search run.
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(80,), max_iter = 1000,
                    activation='relu',
                    learning_rate='constant')
clf.fit(x_train,y_train)
predictions = clf.predict(x_test)
# Compare by position, not by label: y_test is a pandas Series that still
# carries the (shuffled) row labels of the original frame, so `y_test[i]`
# with an integer i is a label lookup that can raise KeyError or silently
# pair the wrong truth value with predictions[i]. A plain array (as used in
# the logistic-regression cell) is aligned with `predictions` element-wise.
actuals = np.array(y_test)

# Class 0 is treated as the "positive" class in the counts below.
actual_zero = actuals == 0
predicted_zero = predictions == 0
tp = int(np.count_nonzero(actual_zero & predicted_zero))
fn = int(np.count_nonzero(actual_zero & ~predicted_zero))
tn = int(np.count_nonzero(~actual_zero & ~predicted_zero))
fp = int(np.count_nonzero(~actual_zero & predicted_zero))
print(tp,tn,fp,fn)
# Accuracy as a PERCENTAGE (0-100), unlike logistic_acc / rf_acc, which
# come from .score() and are fractions in [0, 1].
nn_accuracy = ((tp+tn)*100/(tp+tn+fp+fn))
print(nn_accuracy)

In [None]:
# Predict the binary rating label of the unrated movies with the fitted
# MLP and store it as a further column of `prediction`.
nn_predictions = clf.predict(X=data_null)
prediction['nn_predictions'] = nn_predictions
# compound_grade was only the seed column of the frame; the final table
# keeps just the per-model predictions.
prediction = prediction.drop('compound_grade', axis=1)
# Bare expression: shows the frame as the notebook cell output.
prediction

## Save the results to a CSV file

In [None]:
# Write the per-model predictions (row index included) in CSV format to a
# file named 'prediction' in the working directory.
prediction.to_csv(path_or_buf='prediction')