# KNN Model

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

# set random state for reproducibility
kwargs = dict(random_state=42)

In [21]:
# Load Datasets
movies = pd.read_csv('../data/raw/movies.csv', sep=',')
actors = pd.read_csv('../data/raw/actors.csv', sep=',')
countries = pd.read_csv('../data/raw/countries.csv', sep=',')
directors = pd.read_csv('../data/raw/directors.csv', sep=',')
genres = pd.read_csv('../data/raw/genres.csv', sep=',')
locations = pd.read_csv('../data/raw/locations.csv', sep=',')
movie_tags = pd.read_csv('../data/raw/movie_tags.csv', sep=',')
ratings = pd.read_csv('../data/raw/ratings.csv', sep=',')
tags =  pd.read_csv('../data/raw/tags.csv', sep=',')

In [22]:
#Merging data like Christin -> created csv was 1,11GB 
movies = movies[['id', 'title', 'year']]
movies = movies.rename(columns = {'id':'movieID'})
actors.dropna()
merged_movies = pd.merge(movies, actors, how = 'outer', on='movieID')
countries.dropna()
merged_movies = pd.merge(merged_movies, countries, how = 'outer', on='movieID')

In [23]:
directors.dropna()
merged_movies = pd.merge(merged_movies, directors, how = 'outer', on='movieID')
genres.dropna()
merged_movies = pd.merge(merged_movies, genres, how = 'outer', on='movieID')
merged_movies = pd.merge(merged_movies, ratings, how='outer', on='movieID')
merged_movies = merged_movies.dropna()

In [24]:
merged_movies

Unnamed: 0,movieID,title,year,actorID,actorName,ranking,country,directorID,directorName,genre,user_id,rating
0,1,Toy story,1995.0,annie_potts,Annie Potts,10.0,USA,john_lasseter,John Lasseter,Adventure,1339.0,5.0
1,1,Toy story,1995.0,annie_potts,Annie Potts,10.0,USA,john_lasseter,John Lasseter,Adventure,551.0,3.5
2,1,Toy story,1995.0,annie_potts,Annie Potts,10.0,USA,john_lasseter,John Lasseter,Adventure,336.0,4.5
3,1,Toy story,1995.0,annie_potts,Annie Potts,10.0,USA,john_lasseter,John Lasseter,Adventure,1087.0,3.5
4,1,Toy story,1995.0,annie_potts,Annie Potts,10.0,USA,john_lasseter,John Lasseter,Adventure,1598.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
59296576,65133,Blackadder Back & Forth,1999.0,tim_mcinnerny,Tim McInnerny,9.0,UK,paul_weiland,Paul Weiland,Comedy,480.0,5.0
59296577,65133,Blackadder Back & Forth,1999.0,tim_mcinnerny,Tim McInnerny,9.0,UK,paul_weiland,Paul Weiland,Comedy,1228.0,3.0
59296578,65133,Blackadder Back & Forth,1999.0,tony_robinson,Tony Robinson,10.0,UK,paul_weiland,Paul Weiland,Comedy,1059.0,4.0
59296579,65133,Blackadder Back & Forth,1999.0,tony_robinson,Tony Robinson,10.0,UK,paul_weiland,Paul Weiland,Comedy,480.0,5.0


In [25]:
attributes = ['movieID','title','year', 'value','user_id','rating']

In [45]:
# Restricted to 10000 Rows currently for better calculations
merged_movies = merged_movies.iloc[:10000]

X = np.array(merged_movies.iloc[:,0:11])

y = np.array(merged_movies['rating'])

In [46]:
# Checking Array dimensions
print(X.shape)
print(y.shape)

(1000, 11)
(1000,)


In [47]:
# Preprocess Data: Remove all Strings
le = preprocessing.LabelEncoder()
for i in range(0,11):
    X[:,i] = le.fit_transform(X[:,i])

In [48]:
y = le.fit_transform(y)

In [49]:
# split into training and test set
#TODO Split Data in Three and optimize for validation set to compare rmse and accuracy for Validation and Test Set!

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/5, **kwargs)


In [50]:
# Define Classifier
knn = KNeighborsClassifier(n_neighbors=44, n_jobs=-1)

In [51]:
#training Classifier
print("Training")
knn.fit(X_train, y_train)

Training


KNeighborsClassifier(n_jobs=-1, n_neighbors=44)

In [52]:
# Do predictions
print("Do Prediction")
pred = knn.predict(X_test)

Do Prediction


In [53]:
# Check Accuracy
print("Accuracy : {}".format(accuracy_score(y_test, pred)))
#RMSE for ratings [0,5]
print("RMSE : {}".format(mean_squared_error(y_test, pred, squared=False)))

Accuracy : 0.33
RMSE : 1.5842979517754858


In [44]:
#Improve Algorithm and find out good k to choose
#Limit data set to 100/1000/10000 rows for finding optimal k
#!Only execute this cell with time and computational power!

#Lets save time
limited_movies = merged_movies.iloc[:1000]

X = np.array(limited_movies.iloc[:,0:5])

y = np.array(limited_movies['rating'])

k_acc_scores = []
#start = timeit.default_timer()

for k in range(10, 100):

    print("Testing Classifier for k="+str(k))   
    knn = KNeighborsClassifier(n_neighbors=k,n_jobs=-1)
    knn.fit(X_train, y_train)
    pred = knn.predict(X_test)
    k_acc_scores.append("k({}) = Accuracy: {} RMSE: {} ".format(k, accuracy_score(y_test, pred), mean_squared_error(y_test, pred, squared=False)))
    
print(k_acc_scores)

#stop = timeit.default_timer()

#k = 44 bestes Ergebnis

Testing Classifier for k=10
Testing Classifier for k=11
Testing Classifier for k=12
Testing Classifier for k=13
Testing Classifier for k=14
Testing Classifier for k=15
Testing Classifier for k=16
Testing Classifier for k=17
Testing Classifier for k=18
Testing Classifier for k=19
Testing Classifier for k=20
Testing Classifier for k=21
Testing Classifier for k=22
Testing Classifier for k=23
Testing Classifier for k=24
Testing Classifier for k=25
Testing Classifier for k=26
Testing Classifier for k=27
Testing Classifier for k=28
Testing Classifier for k=29
Testing Classifier for k=30
Testing Classifier for k=31
Testing Classifier for k=32
Testing Classifier for k=33
Testing Classifier for k=34
Testing Classifier for k=35
Testing Classifier for k=36
Testing Classifier for k=37
Testing Classifier for k=38
Testing Classifier for k=39
Testing Classifier for k=40
Testing Classifier for k=41
Testing Classifier for k=42
Testing Classifier for k=43
Testing Classifier for k=44
Testing Classifier f