# Predicting Movie Likes and Dislikes

In [None]:
## All important imports go here
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In this project we compare how well different models can predict whether a movie is liked or disliked. Below we load a dataset downloaded from Kaggle that contains information about movies released from 1986 to 2016, including genre, company, the name of the movie, runtime, etc. The column we care about most is the score column: each movie was given a score out of 10, with 10 being the best and 1 being the worst. We want to use these scores to build and train a model that predicts movie scores accurately.

In [None]:
# Load the movie data from the local CSV, decoding the file as Latin-1
movies = pd.read_csv(
    'movies.csv',
    encoding='latin-1',
)

In [None]:
# Preview the first five rows of the raw dataframe
movies.head(n=5)

In [None]:
# Remove the columns that are not used anywhere in the analysis below
movies = movies.drop(
    columns=[
        'country',
        'rating',
        'released',
        'votes',
        'writer',
        'director',
        'star',
    ],
)

In [None]:
# Mean score across all movies; used as the like/dislike threshold below
avg_score = movies.loc[:, 'score'].mean()

In [None]:
## Binarizing the movie scores into zeros and ones for easy classification later on.
## Both masks are computed from the original scores before anything is overwritten,
## and the assignment goes through .loc so it modifies `movies` itself rather than
## a temporary copy (chained `movies['score'][mask] = ...` is not guaranteed to
## write back and is a no-op under pandas Copy-on-Write).
below_avg = movies['score'] < avg_score
at_or_above_avg = movies['score'] >= avg_score  # a score exactly at the mean counts as a like
movies.loc[below_avg, 'score'] = 0
movies.loc[at_or_above_avg, 'score'] = 1

In [None]:
## Doing a linear regression of budget on gross to find values to replace 0.0
## in the budget column.
## Only rows with a known (non-zero) budget are used for fitting: the 0.0 entries
## are the placeholders we want to fill in, so including them as targets would
## pull the fitted line toward zero.
known_budget = movies['budget'] != 0.0
x = movies.loc[known_budget, 'gross']
y = movies.loc[known_budget, 'budget']

# add_constant adds the intercept column that OLS does not include by default
x_constant = sm.add_constant(x)
gross_budget_model = sm.OLS(y, x_constant)
results = gross_budget_model.fit()
print("Intercept and slope are:", results.params)

In [None]:
# add_constant prepends the constant column, so the fitted parameters are
# ordered [intercept, slope]. Positional access via .iloc avoids relying on
# integer lookups into a label-indexed Series.
b = results.params.iloc[0]  # intercept
m = results.params.iloc[1]  # slope on gross

# Replacing budgets of 0 with the budget values predicted by the linear
# regression model: budget = m * gross + b. The assignment goes through .loc
# so it writes into `movies` directly instead of a chained-indexing temporary.
missing_budget = movies['budget'] == 0.0
movies.loc[missing_budget, 'budget'] = m * movies.loc[missing_budget, 'gross'] + b

In [None]:
# Preview the first five rows after the budget replacement step
movies.head(n=5)

In [None]:
# One-hot encode the genre labels so each genre becomes its own 0/1 column
# that can be used as a numeric feature instead of a string.
genre = movies['genre']
genre_np = genre.to_numpy()
encoder = OneHotEncoder()
# The encoder expects a 2-D input, so the 1-D genre array is reshaped into a
# single column; the encoded result is then converted to a dense array.
genre_ary = encoder.fit_transform(
    genre_np.reshape(-1, 1)
).toarray()

In [None]:
# Wrap the encoded genre array in a dataframe (columns are 0..n-1 at this point)
genre_df = pd.DataFrame(data=genre_ary)

In [None]:
# Label each one-hot column with the genre it encodes. The names are taken from
# the fitted encoder (categories_[0] lists the genres in the same order as the
# encoded columns) instead of a hard-coded index -> name table, so the labels
# cannot drift out of sync with the data.
genre_df.columns = [str(genre_name) for genre_name in encoder.categories_[0]]

In [None]:
# Attach the one-hot genre columns to the right of the movie columns
movies_encoded = pd.concat((movies, genre_df), axis='columns')
movies_encoded.head(n=5)  # check that the encoded columns are present

In [None]:
# Drop the raw genre column (replaced by its one-hot columns) along with
# company and name, leaving the dataframe used for modeling
cleaned_df = movies_encoded.drop(
    columns=['genre', 'company', 'name'],
)
cleaned_df.head(n=5)

## PCA

In [None]:
# Split the cleaned dataframe into the feature matrix (everything except the
# score) and the label vector (the binarized score)
features = cleaned_df.drop(columns=['score'])
labels = cleaned_df['score']

In [None]:
# 75/25 train/test split; the fixed random_state keeps the split reproducible
train_vectors, test_vectors, train_labels, test_labels = train_test_split(
    features,
    labels,
    train_size=.75,
    test_size=.25,
    random_state=1,
)

In [None]:
## Create a whitened PCA with 10 components and fit it on the training vectors only
pca = PCA(n_components=10, whiten=True).fit(train_vectors)

## Project both the train and test vectors with the same fitted PCA
pca_train_vectors, pca_test_vectors = (
    pca.transform(train_vectors),
    pca.transform(test_vectors),
)

In [None]:
## Fit an RBF-kernel SVC on the PCA-transformed training vectors, using
## explicitly chosen hyperparameters (C=10, gamma=0.1) rather than the defaults,
## then predict labels for the PCA-transformed test vectors
pca_svm = SVC(kernel='rbf', C=10, gamma=0.1)
pca_model = pca_svm.fit(pca_train_vectors, train_labels)
pca_ypred = pca_model.predict(pca_test_vectors)

In [None]:
# Compare the predictions against the held-out labels and print the results
pca_confusion = confusion_matrix(test_labels, pca_ypred)
pca_report = classification_report(test_labels, pca_ypred)
print('The confusion matrix is \n', pca_confusion)
print('The classification report is \n', pca_report)