# Imports

In [74]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

%reload_ext autoreload
%autoreload 2
from src.data_utils import unzip, load_data
from src.preprocess import merge, preprocess_ratings, preprocess_movies, preprocess_users

[nltk_data] Downloading package punkt to /home/anaconda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/anaconda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/anaconda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/anaconda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/anaconda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Prepare data

In [75]:
# Extract the archive, then read the fifth predefined split: u5.base for training
# and u5.test for evaluation.
unzip()
users, items, ratings_train = load_data('u5.base')
# The second call rebinds users/items as well; only ratings_test is new here.
users, items, ratings_test = load_data('u5.test')

# Preprocess each table on its own before joining.
users = preprocess_users(users)
items = preprocess_movies(items)
ratings_train = preprocess_ratings(ratings_train)
ratings_test = preprocess_ratings(ratings_test)

# One merged frame per split: ratings combined with the user and movie tables.
data_train = merge(ratings_train, users, items)
data_test = merge(ratings_test, users, items)

# Force every column label to str on both merged frames.
for _merged in (data_train, data_test):
    _merged.columns = _merged.columns.astype(str)

Successfully extracted to data/interim/


# Training

In [77]:
# Fit a random-forest regressor on the u5.base split and score it on u5.test.
# The predefined split is used as-is; no random train_test_split is performed.
ID_COLUMNS = ['user_id', 'movie_id']   # dropped before fitting, so not used as features
TARGET = 'rating'

# Linear rescale applied to the target before fitting and scoring.
# NOTE(review): stored ratings appear to be multiples of 0.25 in [0, 1], so this
# presumably maps them back to the 1-5 star scale -- confirm in preprocess_ratings.
RATING_SCALE, RATING_OFFSET = 4, 1

train = data_train.drop(columns=ID_COLUMNS)
test = data_test.drop(columns=ID_COLUMNS)

X_train = train.drop(columns=TARGET)
X_test = test.drop(columns=TARGET)
y_train = train[TARGET] * RATING_SCALE + RATING_OFFSET
y_test = test[TARGET] * RATING_SCALE + RATING_OFFSET

# Fixed random_state keeps the forest reproducible across kernel restarts.
model = RandomForestRegressor(random_state=123)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Both errors are reported on the rescaled target, not on the stored ratings.
mae = mean_absolute_error(y_test, predictions)
rmse = mean_squared_error(y_true=y_test, y_pred=predictions)**0.5

print(f'RMSE: {rmse:.4f}. MAE: {mae:.4f}')

RMSE: 1.0066. MAE: 0.8023


## Check accuracy

In [63]:
# Count the test rows that are low-rated AND predicted low, and list a sample.
#
# The two thresholds live on different scales: data_test['rating'] holds the
# stored rating, while predictions are on the rescaled (rating*4+1) target the
# model was trained on.
LOW_RATING_MAX = 0.5        # threshold on data_test['rating']
LOW_PREDICTION_MAX = 2.5    # threshold on the model output
MAX_ROWS_SHOWN = 20         # cap on printed matches so the output stays readable

good = 0
# Deliberately not named `users`: that name holds the preprocessed users table.
seen_users = set()

# predictions[k] belongs to the k-th row of X_test, and X_test is data_test with
# columns dropped, so rows are paired by POSITION. Feeding index labels to
# .iloc is only correct when the index happens to be 0..n-1.
assert len(predictions) == len(data_test), 'predictions are stale: re-run the training cell'

for pos, p in enumerate(predictions):
    row = data_test.iloc[pos]
    u = row['user_id']
    m = row['movie_id']
    r = row['rating']
    seen_users.add(u)

    if r <= LOW_RATING_MAX and p <= LOW_PREDICTION_MAX:
        good += 1
        if good <= MAX_ROWS_SHOWN:
            print(f'User: {u}. Film: {m}. Rating: {r}. Prediction: {p}')

if good > MAX_ROWS_SHOWN:
    print(f'... {good - MAX_ROWS_SHOWN} more matching rows not shown')

# Share of ALL test predictions that are low-rated and predicted low. This is
# not overall accuracy, because correctly predicted high ratings are not counted.
print(good/len(predictions))
print(f'Number of users: {len(seen_users)}. Number of predictions: {len(predictions)}')


User: 572. Film: 222. Rating: 0.25. Prediction: 2.345
User: 21. Film: 299. Rating: 0.0. Prediction: 2.47
User: 83. Film: 301. Rating: 0.25. Prediction: 2.31
User: 281. Film: 308. Rating: 0.0. Prediction: 2.46
User: 88. Film: 313. Rating: 0.5. Prediction: 1.93
User: 3. Film: 351. Rating: 0.5. Prediction: 2.33
User: 929. Film: 271. Rating: 0.25. Prediction: 2.47
User: 82. Film: 231. Rating: 0.25. Prediction: 2.22
User: 5. Film: 377. Rating: 0.0. Prediction: 2.33
User: 283. Film: 407. Rating: 0.5. Prediction: 2.31
User: 883. Film: 407. Rating: 0.5. Prediction: 2.11
User: 869. Film: 294. Rating: 0.5. Prediction: 2.34
User: 902. Film: 294. Rating: 0.25. Prediction: 2.19
User: 815. Film: 623. Rating: 0.5. Prediction: 2.31
User: 41. Film: 289. Rating: 0.25. Prediction: 2.48
User: 910. Film: 289. Rating: 0.5. Prediction: 2.35
User: 784. Film: 321. Rating: 0.5. Prediction: 2.45
User: 839. Film: 321. Rating: 0.0. Prediction: 2.47
User: 526. Film: 312. Rating: 0.25. Prediction: 2.48
User: 640. Fi

In [41]:
# Class balance of the test split: how many stored ratings are above 0.5 (True)
# versus at or below it (False).
data_test['rating'].gt(0.5).value_counts()

rating
True     11235
False     8765
Name: count, dtype: int64

# Visualization