# KNN Model

In [80]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

# set random state for reproducibility
kwargs = dict(random_state=42)

In [67]:
# Load Datasets
movies = pd.read_csv('../data/raw/movies.csv', sep=',')
actors = pd.read_csv('../data/raw/actors.csv', sep=',')
countries = pd.read_csv('../data/raw/countries.csv', sep=',')
directors = pd.read_csv('../data/raw/directors.csv', sep=',')
genres = pd.read_csv('../data/raw/genres.csv', sep=',')
locations = pd.read_csv('../data/raw/locations.csv', sep=',')
movie_tags = pd.read_csv('../data/raw/movie_tags.csv', sep=',')
ratings = pd.read_csv('../data/raw/ratings.csv', sep=',')
tags =  pd.read_csv('../data/raw/tags.csv', sep=',')

In [68]:
#Merging data like Christin -> created csv was 1,11GB 
tags = tags.rename(columns = {'id':'tagID'})
tags_movies_merged = pd.merge(movie_tags, tags, how = 'outer', on = 'tagID')
tags_new = tags_movies_merged.dropna()
movies = movies[['id', 'title', 'year']]
movies = movies.rename(columns = {'id':'movieID'})
merged_movies = pd.merge(movies, tags_new, how = 'outer', on='movieID')
merged_movies = pd.merge(merged_movies, ratings, how='outer', on='movieID')
merged_movies = merged_movies.dropna()

In [69]:
merged_movies

Unnamed: 0,movieID,title,year,tagID,tagWeight,value,user_id,rating
0,1,Toy story,1995.0,7.0,1.0,funny,1339.0,5.0
1,1,Toy story,1995.0,7.0,1.0,funny,551.0,3.5
2,1,Toy story,1995.0,7.0,1.0,funny,336.0,4.5
3,1,Toy story,1995.0,7.0,1.0,funny,1087.0,3.5
4,1,Toy story,1995.0,7.0,1.0,funny,1598.0,4.0
...,...,...,...,...,...,...,...,...
15663059,65126,Choke,2008.0,5281.0,1.0,based on book,1273.0,3.0
15663060,65126,Choke,2008.0,5281.0,1.0,based on book,599.0,3.5
15663061,65126,Choke,2008.0,13168.0,1.0,chuck palahniuk,1273.0,3.0
15663062,65126,Choke,2008.0,13168.0,1.0,chuck palahniuk,599.0,3.5


In [71]:
merged_movies = merged_movies.drop(['tagID', 'tagWeight'], axis=1)

In [72]:
merged_movies

Unnamed: 0,movieID,title,year,value,user_id,rating
0,1,Toy story,1995.0,funny,1339.0,5.0
1,1,Toy story,1995.0,funny,551.0,3.5
2,1,Toy story,1995.0,funny,336.0,4.5
3,1,Toy story,1995.0,funny,1087.0,3.5
4,1,Toy story,1995.0,funny,1598.0,4.0
...,...,...,...,...,...,...
15663059,65126,Choke,2008.0,based on book,1273.0,3.0
15663060,65126,Choke,2008.0,based on book,599.0,3.5
15663061,65126,Choke,2008.0,chuck palahniuk,1273.0,3.0
15663062,65126,Choke,2008.0,chuck palahniuk,599.0,3.5


In [73]:
attributes = ['movieID','title','year', 'value','user_id','rating']

In [105]:
X = np.array(merged_movies.iloc[:,0:5])

y = np.array(merged_movies['rating'])

In [106]:
# Checking Array dimensions
print(X.shape)
print(y.shape)

(15630145, 5)
(15630145,)


In [107]:
# Preprocess Data: Remove all Strings
le = preprocessing.LabelEncoder()
for i in range(0,5):
    X[:,i] = le.fit_transform(X[:,i])

In [108]:
y = le.fit_transform(y)

In [114]:
# split into training and test set
#TODO Split Data in Three and optimize for validation set to compare rmse and accuracy for Validation and Test Set!

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/5, **kwargs)


In [115]:
# Define Classifier
knn = KNeighborsClassifier(n_neighbors=100,n_jobs=-1)

In [116]:
#training Classifier
print("Training")
knn.fit(X_train, y_train)

Training


KNeighborsClassifier(n_jobs=-1, n_neighbors=100)

In [117]:
# Do predictions
print("Do Prediction")
pred = knn.predict(X_test)

Do Prediction


In [118]:
# Check Accuracy
print("Accuracy : {}".format(accuracy_score(y_test, pred)))
#RMSE for ratings [0,5]
print("RMSE : {}".format(mean_squared_error(y_test, pred, squared=False)))

Accuracy : 0.2935027154258646
RMSE : 1.9612536346100242


In [None]:
#Improve Algorithm and find out good k to choose
#Limit data set to 10000 rows for finding optimal k
#!Only execute this cell with time and computational power!

limited_movies = merged_movies.iloc[:10000]

X = np.array(limited_movies.iloc[:,0:5])

y = np.array(limited_movies['rating'])

k_acc_scores = []
#start = timeit.default_timer()

for k in range(50, 60):

    print("Testing Classifier for k="+str(k))   
    knn = KNeighborsClassifier(n_neighbors=k,n_jobs=-1)
    knn.fit(X_train, y_train)
    pred = knn.predict(X_test)
    k_acc_scores.append("k({}) = Accuracy: {} RMSE: {} ".format(k, accuracy_score(y_test, pred), mean_squared_error(y_test, pred, squared=False)))
    
print(k_acc_scores)

#stop = timeit.default_timer()

#print('Computing Time: ', stop - start) 

Testing Classifier for k=50
