In [1]:
pip install mlxtend

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
sb.set()
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from scipy.spatial import distance
from scipy.spatial.distance import pdist
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering as hclust
from scipy.cluster.hierarchy import dendrogram, ward
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules 

## Classification Algorithms

### Data Preprocessing

In [3]:
# load the MovieLens rating table from disk
ratings_csv = 'C:/movielens_20m/rating.csv'
ratings = pd.read_csv(ratings_csv)

In [4]:
# count the missing values in every column of ratings
null_ratings = ratings.isna().sum()
null_ratings

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [5]:
# keep only movieId and rating; reassigning (instead of inplace=True) together with
# errors='ignore' lets this cell be re-run without raising a KeyError
ratings = ratings.drop(columns=['userId', 'timestamp'], errors='ignore')
# mean rating per movie (sorting by rating first has no effect on a per-group mean)
grouped_ratings = ratings.groupby('movieId')['rating'].mean()
# display the first 10 rows of grouped_ratings
grouped_ratings.head(10)

movieId
1     3.921240
2     3.211977
3     3.151040
4     2.861393
5     3.064592
6     3.834930
7     3.366484
8     3.142049
9     3.004924
10    3.430029
Name: rating, dtype: float64

In [6]:
# load the MovieLens movie table (movieId, title, genres) from disk
movies_csv = 'C:/movielens_20m/movie.csv'
genres = pd.read_csv(movies_csv)

In [7]:
# count the missing values in every column of genres
null_genres = genres.isna().sum()
null_genres

movieId    0
title      0
genres     0
dtype: int64

In [8]:
# build one indicator column per genre from the genres strings and attach them to movieId
genre_dummies = genres['genres'].str.get_dummies()
seperated_genres = genres.drop(columns=['genres', 'title']).join(genre_dummies)
# preview the first 10 rows of seperated_genres
seperated_genres.head(10)

Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,6,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
6,7,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,8,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,9,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
# join the genre indicators with each movie's mean rating, keyed on movieId
movie_ratings = pd.merge(seperated_genres, grouped_ratings, on='movieId')
# preview the first 10 rows of movie_ratings
movie_ratings.head(10)

Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,rating
0,1,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,3.92124
1,2,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.211977
2,3,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,3.15104
3,4,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,2.861393
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,3.064592
5,6,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,3.83493
6,7,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,3.366484
7,8,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.142049
8,9,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.004924
9,10,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,3.430029


In [10]:
# drop the movieId column from movie_ratings; the `columns=` keyword replaces the
# positional axis argument (`drop('movieId', 1)`), which pandas 2.0 no longer accepts
movie_ratings = movie_ratings.drop(columns='movieId')

In [11]:
# bin the mean rating into two labelled categories and store them as 'scale'
rating_bins = pd.cut(movie_ratings['rating'], bins=2, labels=['Poor', 'Excellent'])
movie_ratings['scale'] = rating_bins
# display the first 10 rows of movie_ratings
movie_ratings.head(10)

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,rating,scale
0,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,3.92124,Excellent
1,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,3.211977,Excellent
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,3.15104,Excellent
3,0,0,0,0,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,2.861393,Excellent
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,3.064592,Excellent
5,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,3.83493,Excellent
6,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,3.366484,Excellent
7,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.142049,Excellent
8,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.004924,Excellent
9,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,3.430029,Excellent


### 1. Decision Tree Classification

In [12]:
# x = every column of movie_ratings except rating and scale (the genre indicators);
# selecting by name keeps this correct even if the column order changes
x = movie_ratings.drop(columns=['rating', 'scale'])
# y = scale (the Poor / Excellent label)
y = movie_ratings['scale']

In [13]:
# hold out 20% of the rows for testing (seed 42), stratified on y
split = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
x_train, x_test, y_train, y_test = split

In [14]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

# depth-limited tree; gini criterion, max_features=None and class_weight=None are
# the library defaults, so only max_depth needs to be spelled out
dtc = DecisionTreeClassifier(max_depth=4)
# train on the training split
dtc.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=4)

In [15]:
# predict class value for x
dtc_pred = dtc.predict(x_test)
dtc_pred

array(['Excellent', 'Excellent', 'Excellent', ..., 'Excellent',
       'Excellent', 'Excellent'], dtype=object)

#### Plot Decision Tree

In [16]:
# display all the columns names of movie_ratings
movie_ratings.columns

Index(['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'War', 'Western', 'rating', 'scale'],
      dtype='object')

In [None]:
from sklearn.tree import plot_tree

# let fn = feature names: they must match the columns the tree was trained on, in order
# (the previous hand-written list omitted '(no genres listed)', shifting every label by one)
fn = list(x_train.columns)
# let cn = class names: they must follow the classifier's own class order (dtc.classes_),
# which is not necessarily ['Poor', 'Excellent']
cn = [str(label) for label in dtc.classes_]

# plot the decision tree on an explicit axes; a depth-4 tree does not need a
# 100x100 inch canvas at 300 dpi (30000x30000 pixels)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(24, 12), dpi=150)
plot_tree(dtc, feature_names=fn, class_names=cn, filled=True, ax=axes)

# save the decision tree plot
fig.savefig('DecisionTree.png')

#### Confusion Matrix

In [None]:
# Confusion Matrix for Decision Tree Classification
from sklearn.metrics import confusion_matrix
cmDtc = confusion_matrix(y_test, dtc_pred)
cmDtc

In [None]:
# Classification Report for Decision Tree Classification
print(classification_report(y_test, dtc_pred))

In [None]:
# plot Confusion Matrix for decision tree
# cmDtc is already the actual-vs-predicted count table (rows = actual, columns = predicted),
# so it is drawn directly instead of being cross-tabulated again. The frame gets its own
# name so the imported confusion_matrix function is not overwritten.
# NOTE(review): labelling with dtc.classes_ assumes every class occurs in the test split.
cm_dtc_df = pd.DataFrame(cmDtc, index=dtc.classes_, columns=dtc.classes_)
heat_ax = sb.heatmap(cm_dtc_df, annot=True, fmt='d')
heat_ax.set(xlabel='Predicted', ylabel='Actual')
plt.show()

### 2. Logistic Regression

In [None]:
# x1 = every column of movie_ratings except rating and scale (the genre indicators);
# selecting by name keeps this correct even if the column order changes
x1 = movie_ratings.drop(columns=['rating', 'scale'])
# y1 = scale (the Poor / Excellent label)
y1 = movie_ratings['scale']

In [None]:
# split x1 and y1 into train/test sets, stratifying on this section's own labels (y1)
# rather than on the decision-tree section's y
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.2, random_state=42, stratify=y1)

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# build the model with C=300 and train it on the training split in one step
log = LogisticRegression(C=300.0).fit(x1_train, y1_train)

# class label predicted for every row of the test split
log_pred = log.predict(x1_test)
log_pred

#### Confusion Matrix

In [None]:
# Confusion Matrix for Logistic Regression
from sklearn.metrics import confusion_matrix
cmLog = confusion_matrix(y1_test, log_pred)
cmLog

In [None]:
# Classification Report for Logistic Regression
# compare against this section's own test labels (y1_test), not the decision-tree y_test
print(classification_report(y1_test, log_pred))

In [None]:
# plot Confusion Matrix for Logistic Regression
# cmLog is already the actual-vs-predicted count table (rows = actual, columns = predicted),
# so it is drawn directly instead of being cross-tabulated again. The frame gets its own
# name so the imported confusion_matrix function is not overwritten.
# NOTE(review): labelling with log.classes_ assumes every class occurs in the test split.
cm_log_df = pd.DataFrame(cmLog, index=log.classes_, columns=log.classes_)
heat_ax = sb.heatmap(cm_log_df, annot=True, fmt='d')
heat_ax.set(xlabel='Predicted', ylabel='Actual')
plt.show()

### 3. Support Vector Machine

In [None]:
# Support Vector Machine
from sklearn.svm import SVC

# change the particular scales to particular numbers
def scale_to_number(scale):
    """Map a rating label to its numeric code: 'Poor' -> 0, 'Excellent' -> 1.

    Any other value matches neither label and yields None.
    """
    for code, label in enumerate(('Poor', 'Excellent')):
        if scale == label:
            return code
    return None

# let x2 = every column of movie_ratings except the last two
x2 = movie_ratings.iloc[:, :-2]
# let y2 = scale converted to numbers with scale_to_number
y2 = np.array([scale_to_number(scale) for scale in movie_ratings['scale']])

In [None]:
# 80/20 train/test split with seed 42 (no stratification is requested here)
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size=0.2, random_state=42)

# linear-kernel support vector classifier with C=1000, trained on the training split
svc = SVC(kernel='linear', C=1000.0)
svc.fit(x2_train, y2_train)

# numeric class predicted for every row of the test split
svc_pred = svc.predict(x2_test)
svc_pred

#### Confusion Matrix

In [None]:
# Confusion Matrix for SVM
from sklearn.metrics import confusion_matrix
cmSvc = confusion_matrix(y2_test, svc_pred)
cmSvc

In [None]:
# Classification Report for SVM
print(classification_report(y2_test, svc_pred))

In [None]:
# plot Confusion Matrix for SVM
# cmSvc is already the actual-vs-predicted count table (rows = actual, columns = predicted),
# so it is drawn directly instead of being cross-tabulated again. The frame gets its own
# name so the imported confusion_matrix function is not overwritten.
# NOTE(review): labelling with svc.classes_ assumes every class occurs in the test split.
cm_svc_df = pd.DataFrame(cmSvc, index=svc.classes_, columns=svc.classes_)
heat_ax = sb.heatmap(cm_svc_df, annot=True, fmt='d')
heat_ax.set(xlabel='Predicted', ylabel='Actual')
plt.show()

### 4. Naive Bayes Model

In [None]:
# x3 = every column except rating and scale, matching the features used by the other
# classifiers (previously only the columns at positions 2 and 3 were used)
x3 = movie_ratings.iloc[:, :-2].values
# y3 = scale (last column)
y3 = movie_ratings.iloc[:, -1].values

In [None]:
# hold out 20% of the rows for testing (seed 42)
from sklearn.model_selection import train_test_split
nb_split = train_test_split(x3, y3, test_size = 0.20, random_state = 42)
x3_prac, x3_test, y3_prac, y3_test = nb_split

In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
# fit the scaler on the training rows only, then apply it to both splits
ss.fit(x3_prac)
x3_prac = ss.transform(x3_prac)
x3_test = ss.transform(x3_test)

In [None]:
# Gaussian Naive Bayes model, trained on the scaled training split
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB().fit(x3_prac, y3_prac)
classifier

In [None]:
# Predicting the Test results
y_pred = classifier.predict(x3_test)

#### Confusion Matrix

In [None]:
#import confusion matrix library for classification
from sklearn.metrics import confusion_matrix, accuracy_score
cmPred = confusion_matrix(y3_test, y_pred)

In [None]:
cmPred

In [None]:
#To print the classification report
print(classification_report(y3_test,y_pred))

In [None]:
# plot Confusion Matrix for Naive Bayes
# cmPred is already the actual-vs-predicted count table (rows = actual, columns = predicted),
# so it is drawn directly instead of being cross-tabulated again. The frame gets its own
# name so the imported confusion_matrix function is not overwritten.
# NOTE(review): labelling with classifier.classes_ assumes every class occurs in the test split.
cm_nb_df = pd.DataFrame(cmPred, index=classifier.classes_, columns=classifier.classes_)
heat_ax = sb.heatmap(cm_nb_df, annot=True, fmt='d')
heat_ax.set(xlabel='Predicted', ylabel='Actual')
plt.show()

### 5. KNN Model

In [None]:
# reload the raw movie and rating tables for the recommender section
movies_df, ratings_df = (
    pd.read_csv('C:/movielens_20m/movie.csv'),
    pd.read_csv('C:/movielens_20m/rating.csv'),
)

In [None]:
# select the needed columns with lists: a set has no defined order, and pandas 2.0
# no longer accepts a set as an indexer
movie = movies_df.loc[:, ["movieId", "title"]]
rating = ratings_df.loc[:, ["userId", "movieId", "rating"]]

In [None]:
# join titles onto the ratings and keep the first 1,000,000 rows
movies_ratings = pd.merge(movie,rating).iloc[:1000000,:]
# title x userId table of ratings; cells without a rating become 0
users_movies = movies_ratings.pivot_table(values = "rating", index = ["title"], columns = ["userId"]).fillna(0)
users_movies.head(10)

In [None]:
# Randomly generated movie. A fixed seed makes the pick (and every recommendation
# derived from it) reproducible when the notebook is re-run from a fresh kernel.
rng = np.random.default_rng(42)
query_index = int(rng.integers(users_movies.shape[0]))
print("Generated Movie is: ",users_movies.index[query_index])

In [None]:
# brute-force cosine nearest-neighbour model over the sparse title x user matrix
users_movies_matrix = csr_matrix(users_movies.values)
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute').fit(users_movies_matrix)
# rating vector of the query movie as a single-row 2-D array
query_vector = users_movies.iloc[query_index,:].values.reshape(1,-1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors = 6)

In [None]:
# Skip position 0 of the neighbour list (presumably the query movie itself — this holds
# unless another title has an identical rating vector) and keep the remaining neighbours.
# The lists get their own names so they no longer overwrite the `movie` DataFrame and
# the imported scipy `distance` module.
neighbor_positions = indices.flatten()[1:]
neighbor_titles = [users_movies.index[pos] for pos in neighbor_positions]
neighbor_distances = list(distances.flatten()[1:])

m = pd.Series(neighbor_titles, name='movie')
d = pd.Series(neighbor_distances, name='distance')
recommend = pd.concat([m, d], axis=1)
# a smaller cosine distance means a more similar movie, so list the closest one first
recommend = recommend.sort_values('distance', ascending=True)

print('Recommendations for {0}:\n'.format(users_movies.index[query_index]))
for i in range(0, recommend.shape[0]):
    print('{0}: {1}, with distance of {2}'.format(i, recommend["movie"].iloc[i], recommend["distance"].iloc[i]))

In [None]:
# NOTE(review): `m` and `recommend["movie"]` hold the same neighbour titles (recommend is
# built from m and then re-sorted), so there is no independent ground truth here.
# NOTE(review): this assignment also overwrites the Naive Bayes section's y3_test.
y3_test = m
knn_pred = recommend["movie"]

#### Confusion Matrix

In [None]:
# Confusion Matrix for KNN
# NOTE(review): y3_test and knn_pred are both the recommended movie titles taken from the
# same neighbour list, so this matrix does not measure prediction quality — TODO confirm intent
from sklearn.metrics import confusion_matrix
cmKNN = confusion_matrix(y3_test, knn_pred)
cmKNN

In [None]:
# Classification Report for KNN
from sklearn.metrics import classification_report
print(classification_report(y3_test, knn_pred))

## Clustering Algorithms

In [None]:
movies_df = pd.read_csv('C:/movielens_20m/movie.csv')
movies_df.head()

In [None]:
# Extract the four-digit year from the first "(YYYY)" group in the title.
# Raw strings keep the regex backslashes intact, and a single capture group around the
# digits replaces the earlier extract-then-strip-parentheses two-step.
movies_df['year'] = movies_df.title.str.extract(r'\((\d{4})\)', expand=False)

# Remove the "(YYYY)" groups from the 'title' column; regex=True is required because
# pandas >= 2.0 treats the pattern as a literal string by default
movies_df['title'] = movies_df.title.str.replace(r'\(\d{4}\)', '', regex=True)

# Strip the leading/trailing whitespace left behind by the removal
movies_df['title'] = movies_df['title'].str.strip()

movies_df.head()

In [None]:
# Drop the genres column and join one indicator column per genre
# (`columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts)
movies_genres = movies_df.drop(columns='genres').join(movies_df.genres.str.get_dummies())
movies_genres.head()

In [None]:
# remove the movieId and year columns, leaving the title and the genre indicators
movies_genres = movies_genres.drop(columns=['movieId', 'year'])
movies_genres

### 1. KMeans Clustering

In [None]:
# KMeans Clustering on every genre indicator column.
# Selecting by label replaces the positional slice 1:20, which stopped one column short
# and left the last genre ('Western') out of the features.
genre_features = movies_genres.loc[:, '(no genres listed)':'Western']
km = KMeans(n_clusters=10,random_state=42)
km.fit(genre_features)

In [None]:
km.labels_

In [None]:
np.unique(km.labels_,return_counts=True)

In [None]:
movies_genres['kmeans_cluster_labels'] = km.labels_
movies_genres.head()

In [None]:
# mean of each numeric column per KMeans cluster; numeric_only=True skips the string
# 'title' column, which pandas >= 2.0 would otherwise fail to average
movies_genres.groupby('kmeans_cluster_labels').mean(numeric_only=True)

### 2. Hierarchical Clustering

In [None]:
# Hierarchical clustering (bottom-up)
# Create 10 clusters. Ward linkage works on Euclidean distance, which is the default,
# so the `affinity` keyword (removed in scikit-learn 1.4) is no longer passed.
model = hclust(linkage="ward", n_clusters=10)

In [None]:
# fit on every genre indicator column, selected by label
# (the positional slice 1:20 stopped one column short and left out 'Western')
model.fit(movies_genres.loc[:, '(no genres listed)':'Western'])

In [None]:
model.n_clusters

In [None]:
# Label on 10 clusters
model.labels_

In [None]:
# number of labelled data points (the model was created with n_clusters=10)
len(model.labels_)

In [None]:
# Distinct cluster labels and the number of data points in each cluster
np.unique(model.labels_,return_counts=True)

In [None]:
# Ward linkage matrix over every genre indicator column, selected by label
# (the positional slice 1:20 stopped one column short and left out 'Western')
Z = ward(movies_genres.loc[:, '(no genres listed)':'Western'])
Z

In [None]:
# make the column to shows cluster label for each genre
movies_genres['hclust_label'] = model.labels_
movies_genres

In [None]:
# mean of each numeric column per hierarchical cluster; numeric_only=True skips the
# string 'title' column, which pandas >= 2.0 would otherwise fail to average
movies_genres.groupby(by='hclust_label').mean(numeric_only=True)

In [None]:
# Show the first rows of hierarchical cluster 1
# NOTE(review): described as a comedy cluster — confirm against the per-cluster means
movies_genres[movies_genres.hclust_label==1].head()

In [None]:
# Show the first 10 rows of hierarchical cluster 2
# NOTE(review): also described as a comedy cluster — confirm against the per-cluster means
movies_genres[movies_genres.hclust_label==2].head(10)

In [None]:
# To find a movie from itemset that similar with identified movie
# Example: toy story
# The result shows the movie is in the 8th of hclust_label and 6th kmeans_cluster_labels 
movies_genres[movies_genres.title.str.match("toy story",case=False)] 

In [None]:
# Show the first 10 rows of cluster 8 from the hierarchical clustering
movies_genres[movies_genres["hclust_label"]==8].head(10) 

In [None]:
# To show the the cluster 6 in KMeans
movies_genres[movies_genres["kmeans_cluster_labels"]==6].head(10) 

## Association Rules 

In [None]:
movies_df = pd.read_csv('C:/movielens_20m/movie.csv')
movies_df.head()

In [None]:
# Drop the genres column and join one indicator column per genre
# (`columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts)
movies_genres = movies_df.drop(columns='genres').join(movies_df.genres.str.get_dummies())
# index by (movieId, title) so that only the genre indicators remain as columns
movies_genres = movies_genres.set_index(['movieId','title'])
movies_genres.head()

In [None]:
# Association rules: genre indicator columns only (movieId, genres and title removed);
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts
movies_genres2 = movies_df.drop(columns=['movieId','genres','title']).join(movies_df.genres.str.get_dummies())
movies_genres2.head()

In [None]:
# frequent itemset retrieval; apriori expects boolean indicator columns, so the 0/1
# integers are cast to bool before mining
frequent_itemsets = apriori(movies_genres.astype(bool), min_support = 0.02, use_colnames=True)

# number of genres in each itemset
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
frequent_itemsets.head(10)

In [None]:
# For itemset with length = 1, support >= 8%
# (the filter previously tested length == 2, contradicting this description and
# overlapping with the length-2 query in the next cell)
frequent_itemsets[ (frequent_itemsets['length'] == 1) & (frequent_itemsets['support'] >= 0.08) ]

In [None]:
# For itemset with length = 2, support >= 5%
frequent_itemsets[ (frequent_itemsets['length'] == 2) & (frequent_itemsets['support'] >= 0.05) ]

In [None]:
# For itemset with length = 3, support >= 1%
# (apriori was run with min_support = 0.02, so this threshold keeps every length-3 itemset)
frequent_itemsets[ (frequent_itemsets['length'] == 3) & (frequent_itemsets['support'] >= 0.01) ]

In [None]:
# The sample for querying the genres of an itemset
frequent_itemsets[ frequent_itemsets['itemsets'] == {'Action', 'Adventure'} ]

In [None]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'Comedy', 'Romance'} ]

In [None]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'Drama'} ]