# KNN Model

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

# set random state for reproducibility
kwargs = dict(random_state=42)

In [None]:
# Load Datasets
movies = pd.read_csv('../data/raw/movies.csv', sep=',')
actors = pd.read_csv('../data/raw/actors.csv', sep=',')
countries = pd.read_csv('../data/raw/countries.csv', sep=',')
directors = pd.read_csv('../data/raw/directors.csv', sep=',')
genres = pd.read_csv('../data/raw/genres.csv', sep=',')
locations = pd.read_csv('../data/raw/locations.csv', sep=',')
movie_tags = pd.read_csv('../data/raw/movie_tags.csv', sep=',')
ratings = pd.read_csv('../data/raw/ratings.csv', sep=',')
tags =  pd.read_csv('../data/raw/tags.csv', sep=',')

In [None]:
#Merging data like Christin -> created csv was 1,11GB 
movies = movies[['id', 'title', 'year']]
movies = movies.rename(columns = {'id':'movieID'})
actors.dropna()
merged_movies = pd.merge(movies, actors, how = 'outer', on='movieID')
countries.dropna()
merged_movies = pd.merge(merged_movies, countries, how = 'outer', on='movieID')

In [None]:
directors.dropna()
merged_movies = pd.merge(merged_movies, directors, how = 'outer', on='movieID')
genres.dropna()
merged_movies = pd.merge(merged_movies, genres, how = 'outer', on='movieID')
merged_movies = pd.merge(merged_movies, ratings, how='outer', on='movieID')
merged_movies = merged_movies.dropna()

In [None]:
merged_movies

In [None]:
attributes = ['movieID','title','year', 'value','user_id','rating']

In [None]:
limited_movies = merged_movies.iloc[:1000]
X = np.array(merged_movies.iloc[:,0:11])

y = np.array(merged_movies['rating'])

In [None]:
# Checking Array dimensions
print(X.shape)
print(y.shape)

In [17]:
# Preprocess Data: Remove all Strings
le = preprocessing.LabelEncoder()
for i in range(0,11):
    X[:,i] = le.fit_transform(X[:,i])

In [18]:
y = le.fit_transform(y)

In [None]:
# split into training and test set
#TODO Split Data in Three and optimize for validation set to compare rmse and accuracy for Validation and Test Set!

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/5, **kwargs)


In [None]:
# Define Classifier
knn = KNeighborsClassifier(n_neighbors=80, n_jobs=-1)

In [None]:
#training Classifier
print("Training")
knn.fit(X_train, y_train)

In [None]:
# Do predictions
print("Do Prediction")
pred = knn.predict(X_test)

In [None]:
# Check Accuracy
print("Accuracy : {}".format(accuracy_score(y_test, pred)))
#RMSE for ratings [0,5]
print("RMSE : {}".format(mean_squared_error(y_test, pred, squared=False)))

In [136]:
#Improve Algorithm and find out good k to choose
#Limit data set to 100/1000/10000 rows for finding optimal k
#!Only execute this cell with time and computational power!

#Lets save time
limited_movies = merged_movies.iloc[:100]

X = np.array(limited_movies.iloc[:,0:5])

y = np.array(limited_movies['rating'])

k_acc_scores = []
#start = timeit.default_timer()

for k in range(50, 100):

    print("Testing Classifier for k="+str(k))   
    knn = KNeighborsClassifier(n_neighbors=k,n_jobs=-1)
    knn.fit(X_train, y_train)
    pred = knn.predict(X_test)
    k_acc_scores.append("k({}) = Accuracy: {} RMSE: {} ".format(k, accuracy_score(y_test, pred), mean_squared_error(y_test, pred, squared=False)))
    
print(k_acc_scores)

#stop = timeit.default_timer()

#print('Computing Time: ', stop - start) 

Testing Classifier for k=50


KeyboardInterrupt: 