In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

In [None]:
# Load ml-25m/final.csv into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv('ml-25m/final.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Set aside title and rating in their own frame, then remove the title,
# genres and movieId columns from the working frame.
# Note: this cell rebinds `df`, so running it a second time raises a KeyError
# because the dropped columns are no longer present.
df_names_rating = df.loc[:, ['title', 'rating']]
df = df.drop(columns=['title', 'genres', 'movieId'])

In [None]:
# Summary statistics of the rating column, taken before a later cell
# replaces the values with category labels.
df['rating'].describe()

In [None]:
def categorize_rating(rating):
    """Bucket a numeric rating into a coarse quality label.

    Args:
        rating: A value comparable with floats via ``>=``.

    Returns:
        'good' when rating >= 3.5, 'ok' when 2.5 <= rating < 3.5, and 'bad'
        otherwise. NaN also yields 'bad' because every ``>=`` comparison
        against it is false.
    """
    # Thresholds are checked from highest to lowest; the first one met wins.
    for lower_bound, label in ((3.5, 'good'), (2.5, 'ok')):
        if rating >= lower_bound:
            return label
    return 'bad'

In [None]:
# Replace each numeric rating with its category label, element by element.
# Note: the column holds strings afterwards, so re-running this cell would
# compare strings against floats inside categorize_rating and raise TypeError.
df['rating'] = df['rating'].map(categorize_rating)

In [None]:
# Preview the frame again now that rating holds category labels.
df.head()

In [None]:
# Separate the frame into the feature matrix X (every column except rating)
# and the label vector y (the rating column).
X = df.drop(columns=['rating'])
y = df['rating']

In [None]:
# Fit a PCA that keeps as many components as X has columns, so the complete
# explained-variance spectrum is available for inspection.
pca_model = PCA(n_components=X.shape[1])
pca_model.fit(X)

In [None]:
# Plot cumulative explained variance against the number of retained
# components and mark the component count used for the reduced model.
cumulative_variance = np.cumsum(pca_model.explained_variance_ratio_)
# Entry k of the cumulative sum covers k + 1 components, so the x values start
# at 1 to agree with the "Number of Components" axis label.
component_counts = np.arange(1, len(cumulative_variance) + 1)
n_kept = 694

plt.figure(figsize=(10,10))
plt.plot(component_counts, cumulative_variance)
plt.axvline(x=n_kept, color='r', linestyle='--')
# The legend value is read from the fitted model instead of being typed in,
# so it cannot drift from the data.
plt.legend([
    'Cumulative explained variance',
    f'{n_kept} features = {cumulative_variance[n_kept - 1]:.4f} explained variance',
])
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA Explained Variance')
plt.savefig('pca_explained_variance.png')

In [None]:
# Find the first position at which cumulative explained variance reaches 0.97
# and print that position with the variance reached there.
# The printed position is a zero-based index: reaching it takes i + 1 components.
explained_variance = pca_model.explained_variance_ratio_.cumsum()
for i, cumulative in enumerate(explained_variance):
    if cumulative >= 0.97:
        print(i, cumulative)
        break

In [None]:
# Refit PCA keeping 694 components and project X onto them.
# This rebinds `pca_model`, replacing the full-spectrum model fitted earlier.
# NOTE(review): if 694 was taken from the zero-based index printed by the
# threshold search, 695 components would be needed to reach 0.97 - confirm.
pca_model = PCA(n_components=694)
df_pca = pca_model.fit_transform(X)

In [None]:
# Wrap the projected array in a DataFrame; `df_pca` is rebound from the raw
# PCA output to this frame.
df_pca = pd.DataFrame(df_pca)

In [None]:
# Write the reduced features and the rating labels to separate CSV files,
# omitting the index column from both.
df_pca.to_csv('ml-25m/pca_final.csv', index=False)
y.to_csv('ml-25m/rating_only.csv', index=False)

In [None]:
# Embed the PCA-reduced features in two dimensions with t-SNE and draw the
# points grouped by rating category.
# random_state pins the otherwise stochastic embedding so reruns reproduce it.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
# confirm which keyword the installed version expects.
TSNE_model = TSNE(n_components=2, verbose=1, perplexity=500, n_iter=300, random_state=42)
df_tsne = TSNE_model.fit_transform(df_pca)

plt.figure(figsize=(10,10))
# One scatter call per category gives every class its own colour and legend
# entry. Passing the labels through `label=` without `c=` leaves all points
# the same colour and makes a colour bar meaningless.
for category in sorted(y.unique()):
    # Positional boolean mask, so it lines up with the rows of df_tsne
    # regardless of y's index labels.
    mask = (y == category).to_numpy()
    plt.scatter(df_tsne[mask, 0], df_tsne[mask, 1], label=category)
plt.legend(title='rating')
plt.xlabel('t-SNE dimension 1')
plt.ylabel('t-SNE dimension 2')
plt.title('t-SNE embedding of PCA features')
# Save before show(): once the figure has been shown, a later savefig writes
# an empty image.
plt.savefig('tsne_plot.png')
plt.show()

In [None]:
# Project the reduced features onto their first two principal components and
# draw the points coloured by rating category.
cdict = {'bad': 'red', 'ok': 'blue', 'good': 'green'}

pca_visualization = PCA(n_components=2)
# Use the dedicated 2-component model (not the 694-component `pca_model`) and
# keep the projection under its own name so `df_pca` is not overwritten and
# the cell can be re-run safely.
df_pca_2d = pca_visualization.fit_transform(df_pca)

plt.figure(figsize=(10,10))
# A dict is not a valid `c=` argument; draw each category separately with its
# mapped colour so the legend documents the colour coding.
for category, colour in cdict.items():
    # Positional boolean mask aligned with the rows of df_pca_2d.
    mask = (y == category).to_numpy()
    plt.scatter(df_pca_2d[mask, 0], df_pca_2d[mask, 1], color=colour, label=category)
plt.legend(title='rating')
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.title('First two principal components by rating')
# Save before show(): once the figure has been shown, a later savefig writes
# an empty image.
plt.savefig('pca_plot.png')
plt.show()

In [None]:
# Bar chart of how many rows carry each rating label, written to disk.
plt.figure(figsize=(10, 10))
y.value_counts().plot.bar()
plt.savefig('rating_distribution.png')

In [None]:
# Peek at the first few rating labels.
y.head()