# Predicting Movie Likes and Dislikes

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
import seaborn as sea

In [None]:
# Read the movie data from movies.csv, decoding the file as Latin-1.
movies = pd.read_csv("movies.csv", encoding="latin-1")

In [None]:
# Preview the first few rows rather than rendering the whole frame.
movies.head()

In [None]:
# Discard the columns that are not referenced anywhere in the rest of the analysis.
unused_columns = ['country', 'rating', 'released', 'votes', 'writer', 'director', 'star']
movies = movies.drop(columns=unused_columns)

In [None]:
# Mean of the raw scores; this is the cut-off later used to separate
# disliked (below) from liked (above) movies.
avg_score = movies.loc[:, 'score'].mean()

In [None]:
# Turn the raw score into the like/dislike label: 1.0 when the score is
# strictly above the average, 0.0 otherwise. Missing scores stay missing.
#
# The label is computed in one vectorised step and assigned back to the
# column as a whole. Assigning through movies['score'][mask] may write to a
# temporary copy, and it also left scores exactly equal to the average at
# their raw value, which is not a valid 0/1 label.
#
# NOTE: avg_score is on the raw score scale, so this cell must be run exactly
# once per load of the data; re-running it on already-binarised scores would
# label every movie 0.
score_raw = movies['score']
liked = (score_raw > avg_score).astype(float)
movies['score'] = liked.where(score_raw.notna())

In [None]:
# Ordinary least squares fit of budget (response) on gross (predictor).
# The fitted line is used afterwards to fill in budgets recorded as 0.0.
y = movies['budget']
x = movies['gross']

# add_constant supplies the intercept column for the regression.
x_constant = sm.add_constant(x)

gross_budget_model = sm.OLS(endog=y, exog=x_constant)
results = gross_budget_model.fit()
print("Intercept and slope are:", results.params)

In [None]:
# Impute budgets that are recorded as 0.0 using the budget ~ gross regression
# stored in `results`.
#
# sm.add_constant prepends the intercept column, so the first fitted
# parameter is the intercept and the second is the slope on gross. The
# coefficients are looked up by label so the two cannot be mixed up.
b = results.params['const']  # intercept
m = results.params['gross']  # slope on gross

# Select the affected rows once and write back with .loc, so the update is
# applied to `movies` itself rather than to a temporary copy.
zero_budget = movies['budget'] == 0.0
movies.loc[zero_budget, 'budget'] = m * movies.loc[zero_budget, 'gross'] + b

In [None]:
# Preview the first few rows after the budget imputation rather than
# rendering the whole frame.
movies.head()

In [None]:
# One-hot encode the genre column into a dense indicator array.
genre = movies['genre']
genre_np = genre.to_numpy()

encoder = OneHotEncoder()
# The values are reshaped into a single 2-D column before being handed to
# the encoder.
genre_column = genre_np.reshape(-1, 1)
genre_sparse = encoder.fit_transform(genre_column)
genre_ary = genre_sparse.toarray()

In [None]:
# Wrap the indicator array in a DataFrame; columns are numbered 0..k-1 at this point.
genre_df = pd.DataFrame(data=genre_ary)

In [None]:
# Give the numbered indicator columns readable genre names.
# NOTE(review): this hard-coded order assumes the encoder produced exactly
# these 17 categories in this order - verify against encoder.categories_.
genre_names = [
    'Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
    'Drama', 'Family', 'Fantasy', 'Horror', 'Musical', 'Mystery', 'Romance',
    'Sci-Fi', 'Thriller', 'War', 'Western',
]
genre_df = genre_df.rename(columns=dict(enumerate(genre_names)))

In [None]:
# Attach the genre indicator columns next to the original movie columns
# (rows are matched on the index).
movies_encoded = pd.concat([movies, genre_df], axis='columns')

In [None]:
# Remove the original genre column (now represented by the indicator
# columns) together with the company and name columns.
dropped_columns = ['genre', 'company', 'name']
cleaned_df = movies_encoded.drop(columns=dropped_columns)
cleaned_df.head()

In [None]:
# Pairwise correlations between the columns of the cleaned frame.
corr = cleaned_df.corr()

# Draw the correlation matrix as a heatmap on a 10x8 figure.
fig = plt.figure(figsize=(10, 8))
sea.heatmap(data=corr, cmap="hot")

In [None]:
# The like/dislike label is the score column; every other column is a feature.
labels = cleaned_df['score']
features = cleaned_df.drop(columns='score')

In [None]:
# Hold out 25% of the rows for testing and train on the remaining 75%.
# The fixed random_state keeps the split reproducible between runs.
split_parts = train_test_split(features, labels, train_size=0.75, random_state=1)
train_features, test_features, train_labels, test_labels = split_parts

With the training and testing sets of data, a logistic model is made to try and predict whether a movie was liked or disliked. First, the logistic model is built by passing the labels and features for the training data into the Logit function from the statsmodels package. A constant is added to the training features for this model. Next, the results of the model can be viewed once it is fitted. Then the test features can be passed in to get the predictions from the fitted model. Finally, these predictions are matched up with the actual likes and dislikes to evaluate the performance of the model. If a predicted value was above 0.5, the movie was classified as liked; otherwise, it was classified as disliked.

In [None]:
# Full logistic regression: like/dislike label against every feature plus an
# intercept column.
# NOTE(review): the features hold an indicator for every genre as well as the
# added constant, which are linearly dependent - confirm the fit is
# well-conditioned.
train_design = sm.add_constant(train_features)
logit_model = sm.Logit(train_labels, train_design)

# Fit on the training data and show the coefficient table.
result = logit_model.fit()
print(result.summary())

# Predicted probabilities for the held-out rows; anything above 0.5 is
# classified as a like (1), everything else as a dislike (0).
b = result.predict(sm.add_constant(test_features))
temp = [1 if probability > 0.5 else 0 for probability in b]

# Share of held-out movies whose predicted class matches the actual label.
print("The accuracy of the model is", metrics.accuracy_score(y_pred = temp,y_true= test_labels))

From the results, it is evident that the accuracy of the model was not the best but it is significantly better than if the model was just left to chance. About 66% of movies were correctly guessed to have been liked or disliked. From the summary of the model, it seems that gross, runtime, budget, and year are significant in predicting movie likes. Also, one can see by the p-values that the genre categories and the constant were not very significant in the model. A simplified model can be made to try and increase accuracy without these insignificant features. After reducing the feature set and splitting the data into training and testing sets again, one can follow the previous methods to create the reduced logistic model.

In [None]:
# Reduced model: remove the genre indicator columns, which had high p-values
# in the full model.
genre_columns = [
    'Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
    'Drama', 'Family', 'Fantasy', 'Horror', 'Musical', 'Mystery', 'Romance',
    'Sci-Fi', 'Thriller', 'War', 'Western',
]
features2 = features.drop(columns=genre_columns)

# Same 75/25 split and seed as the full model, applied to the reduced features.
split_parts2 = train_test_split(features2, labels, train_size=0.75, random_state=1)
train_features2, test_features2, train_labels2, test_labels2 = split_parts2

# Logistic regression on the reduced features; no constant is added this time.
logit_model2 = sm.Logit(train_labels2, train_features2)
results2 = logit_model2.fit()

# Predicted probabilities for the held-out rows, cut at the same 0.5 threshold.
b = results2.predict(test_features2)
temp2 = [1 if probability > 0.5 else 0 for probability in b]

# Coefficient table followed by the held-out accuracy of the reduced model.
print(results2.summary())
print("The accuracy of the new model is", metrics.accuracy_score(y_pred = temp2,y_true= test_labels2))

After reducing the model, it can be seen that the accuracy dropped a little, although it is nearly the same value as the full model.