# Predicting Movie Likes and Dislikes

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [None]:
# Load the movie dataset from 'movies.csv' (path relative to the working
# directory), decoding the file as Latin-1.
movies = pd.read_csv(
    'movies.csv',
    encoding='latin-1',
)

In [None]:
# Notebook cell: evaluate the freshly loaded DataFrame so it is rendered as the
# cell's output for a quick visual inspection.
movies

In [None]:
# Columns removed before modelling; the visible cells below never read them.
unused_columns = [
    'country',
    'rating',
    'released',
    'votes',
    'writer',
    'director',
    'star',
]
# `columns=` already selects the column axis, so no separate `axis` is needed.
movies = movies.drop(columns=unused_columns)

In [None]:
# Mean of the 'score' column; the following cell uses it as the threshold that
# splits scores into the two classes 0 and 1.
avg_score = movies.loc[:, 'score'].mean()

In [None]:
# Binarise 'score' in place around the mean:
#   0 -> score below the average
#   1 -> score equal to or above the average
# Both masks are computed from the untouched scores *before* anything is
# overwritten, so the second assignment cannot be affected by the first.
# Writing through a single `.loc[mask, column]` call modifies `movies` itself;
# the chained form `movies['score'][mask] = ...` may assign to a temporary copy
# (SettingWithCopyWarning) and has no effect at all under pandas copy-on-write.
# Using `>=` for the upper class ensures a score exactly equal to the mean is
# also mapped to a class instead of keeping its raw value.
# Missing scores (NaN) match neither mask and are left as NaN.
below_avg = movies['score'] < avg_score
at_or_above_avg = movies['score'] >= avg_score
movies.loc[below_avg, 'score'] = 0
movies.loc[at_or_above_avg, 'score'] = 1

In [None]:
# Fit an ordinary-least-squares line  budget = intercept + slope * gross.
# The fitted parameters are used by the next cell to fill in budgets that are
# recorded as 0.0.
x = movies['gross']
y = movies['budget']

# A budget of 0.0 is the placeholder that the next cell replaces, so those rows
# carry no real budget information. Including them would drag the fitted line
# towards zero, so the model is trained only on rows with a known budget.
known_budget = y != 0.0

# add_constant prepends a 'const' column so the model has an intercept term;
# the resulting parameters are labelled 'const' (intercept) and 'gross' (slope).
x_constant = sm.add_constant(x[known_budget])
# missing='drop' skips rows containing NaN instead of propagating NaN into
# the fitted parameters.
gross_budget_model = sm.OLS(y[known_budget], x_constant, missing='drop')
results = gross_budget_model.fit()
print("Intercept and slope are:", results.params)

In [None]:
# Replace placeholder budgets (stored as 0.0) with the value predicted by the
# fitted regression line:  budget = m * gross + b.
#
# `results.params` is labelled by design-matrix column: 'const' is the
# intercept and 'gross' is the slope. Selecting by label avoids mixing the two
# up (position 0 is the intercept, not the slope) and avoids positional
# integer indexing on a label-indexed Series, which pandas has deprecated.
m = results.params['gross']  # slope
b = results.params['const']  # intercept

# Make sure the column can hold the fractional predictions.
movies['budget'] = movies['budget'].astype(float)

# One vectorised `.loc` assignment writes into `movies` itself; the chained
# form `movies['budget'].iloc[i] = ...` may write to a temporary copy instead.
missing_budget = movies['budget'] == 0.0
movies.loc[missing_budget, 'budget'] = (
    m * movies.loc[missing_budget, 'gross'] + b
)

In [None]:
# Notebook cell: show the DataFrame again so the effect of the budget
# imputation in the previous cell can be inspected in the cell output.
movies

In [None]:
# One-hot encode the 'genre' column into a dense 0/1 matrix with one column
# per distinct genre.
genre = movies['genre']
genre_np = genre.to_numpy()

encoder = OneHotEncoder()
# The encoder expects 2-D input, so the 1-D genre array is reshaped into a
# single column; `.toarray()` converts the encoder's output to a dense array.
genre_column = genre_np.reshape(-1, 1)
genre_ary = encoder.fit_transform(genre_column).toarray()

In [None]:
# Wrap the one-hot matrix in a DataFrame. Columns receive the default integer
# labels 0, 1, 2, ... which the next cell replaces with genre names.
genre_df = pd.DataFrame(data=genre_ary)

In [None]:
# Label every one-hot column with the genre it encodes.
# Column i of the encoder output corresponds to encoder.categories_[0][i], so
# the names are taken from the fitted encoder rather than from a hand-written
# list. This keeps the labels correct even if the dataset contains a different
# set (or number) of genres than expected.
genre_names = dict(enumerate(encoder.categories_[0]))
genre_df = genre_df.rename(genre_names, axis=1)

In [None]:
# Place the genre indicator columns next to the existing movie columns;
# rows are matched up by index.
movies_encoded = pd.concat(
    [movies, genre_df],
    axis='columns',
)
# `cleaned_df` is a second name for the same DataFrame object, not a copy.
cleaned_df = movies_encoded
cleaned_df

## Grid Search CV

In [None]:
# Build the two inputs for the train/test split.
# NOTE: the variable names are the reverse of the usual convention. They are
# kept as-is because the cells below rely on them:
#   `features` -> the 'score' column, i.e. the value the model predicts
#   `labels`   -> the predictor table (everything except the text columns and
#                 the score itself)
non_predictor_columns = ['company', 'name', 'score', 'genre']
features = movies['score']
labels = cleaned_df.drop(columns=non_predictor_columns)
labels

In [None]:
# Split the data 75% / 25% into training and test sets with a fixed seed.
# train_test_split returns a (train, test) pair per input, in argument order.
# Because `features` holds the score column and `labels` holds the predictor
# table, the results are:
#   ltrain / ltest -> score values (prediction target)
#   ftrain / ftest -> predictor rows
ltrain, ltest, ftrain, ftest = train_test_split(
    features,
    labels,
    train_size=.75,
    random_state=1,
)
print(len(ftrain), len(ltrain), len(ftest), len(ltest))
ltest.shape  # bare expression mid-cell: evaluated, but not displayed
print(ltrain)

In [None]:
# Search over C and gamma for an RBF-kernel support vector classifier and
# report the best combination found.
# NOTE(review): the predictor table is passed to the SVM as-is; it mixes the
# 0/1 genre indicators with the 'budget' and 'gross' columns. RBF kernels are
# sensitive to feature scale -- consider standardising the inputs; TODO confirm.
params = {
    'C': [1, 10, 100, 1000],
    'gamma': [.1, .01, .001, .0001],
    'kernel': ['rbf'],
}
grid = GridSearchCV(estimator=SVC(), param_grid=params)
grid.fit(ftrain, ltrain)
# Predictions of the best estimator on the held-out predictor rows.
grid_predictions = grid.predict(ftest)
print(grid.best_params_)

In [None]:
# Fit again with a single fixed parameter combination (C=1, gamma=0.1, RBF
# kernel) and evaluate the predictions on the test split.
params = {
    'C': [1],
    'gamma': [.1],
    'kernel': ['rbf'],
}
grid = GridSearchCV(estimator=SVC(), param_grid=params)
grid.fit(ftrain, ltrain)
grid_prediction = grid.predict(ftest)

# Compare the predictions with the true test scores (`ltest`).
test_confusion = confusion_matrix(ltest, grid_prediction)
test_report = classification_report(ltest, grid_prediction)
print(test_confusion, test_report)