In [1]:
import numpy as np
import pandas as pd
# Reading ratings file
ratings = pd.read_csv('data/ratings2.csv',  encoding='latin-1',
                      usecols=['user_id', 'movie_id', 'rating', 'timestamp'])

# Reading users file
users = pd.read_csv('data/users.csv', encoding='latin-1',
                    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])

# Reading movies file
movies = pd.read_csv('data/movies.csv',  encoding='latin-1',
                     usecols=['movie_id', 'title', 'genres'])


In [2]:
users

Unnamed: 0,user_id,gender,zipcode,age_desc,occ_desc
0,1,F,48067,Under 18,K-12 student
1,2,M,70072,56+,self-employed
2,3,M,55117,25-34,scientist
3,4,M,02460,45-49,executive/managerial
4,5,M,55455,25-34,writer
...,...,...,...,...,...
6035,6036,F,32603,25-34,scientist
6036,6037,F,76006,45-49,academic/educator
6037,6038,F,14706,56+,academic/educator
6038,6039,F,01060,45-49,other or not specified


In [3]:
user_counts = ratings['user_id'].value_counts()
active_users = user_counts[user_counts > 100].index.tolist()
active_ratings = ratings[ratings['user_id'].isin(active_users)]
merge_ratings = pd.merge(active_ratings, movies, on='movie_id')
merge_ratings['genres'] = merge_ratings['genres'].str.split('|')
merge_ratings = merge_ratings.explode('genres')

ratings_pivot = pd.pivot_table(
    merge_ratings, values='rating', index='user_id', columns='genres', aggfunc=np.mean)
ratings_pivot = ratings_pivot.fillna(0)
ratings_pivot

genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2,3.500000,3.736842,0.000000,0.000000,3.560000,3.583333,0.000000,3.898734,3.000000,4.000000,3.000000,0.000000,3.333333,3.708333,3.588235,3.483871,3.733333,4.333333
5,2.612903,3.000000,4.000000,3.833333,3.410714,3.285714,3.666667,3.096154,0.000000,4.000000,2.800000,3.333333,3.125000,3.100000,3.066667,2.846154,3.500000,4.000000
8,4.027778,3.888889,4.250000,3.000000,3.888889,4.000000,0.000000,3.814815,0.000000,0.000000,5.000000,4.000000,3.500000,4.066667,4.461538,4.034483,4.200000,4.000000
9,3.500000,3.428571,4.111111,4.200000,3.740741,3.750000,3.500000,3.888889,3.000000,4.000000,2.000000,0.000000,4.000000,3.555556,3.888889,3.629630,4.142857,5.000000
10,3.913580,4.114286,4.303030,4.109375,4.136612,3.600000,4.000000,4.181034,4.315789,3.500000,4.000000,4.432432,3.625000,4.373134,4.183099,4.222222,3.833333,3.875000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,2.634921,3.000000,3.250000,2.035714,2.246154,3.150000,0.000000,3.116071,3.083333,3.000000,2.333333,1.500000,3.300000,3.047619,2.762500,1.972973,3.307692,2.333333
6036,3.000000,2.987952,3.911765,3.444444,3.203065,3.528302,3.909091,3.505376,3.000000,4.058824,2.986486,3.709677,3.411765,3.352459,2.834320,3.142857,3.785714,3.642857
6037,3.642857,4.000000,4.000000,3.666667,3.576271,3.833333,4.000000,3.877551,4.250000,3.444444,4.111111,4.000000,3.692308,3.681818,3.692308,3.705882,4.000000,3.750000
6039,4.000000,4.100000,3.615385,3.529412,3.723077,4.000000,0.000000,4.000000,3.600000,4.500000,4.000000,3.690476,4.176471,3.800000,4.250000,4.142857,4.111111,4.500000


In [4]:
count_df = merge_ratings.groupby(['user_id', 'genres']).size().unstack(fill_value=0)
sum_df = count_df.sum(axis=1)
frac_df = count_df.div(sum_df, axis=0)
frac_df = frac_df.fillna(0)

frac_df

genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2,0.194444,0.065972,0.000000,0.000000,0.086806,0.041667,0.000000,0.274306,0.003472,0.003472,0.006944,0.000000,0.010417,0.083333,0.059028,0.107639,0.052083,0.010417
5,0.088068,0.025568,0.011364,0.017045,0.159091,0.059659,0.017045,0.295455,0.000000,0.008523,0.028409,0.008523,0.022727,0.085227,0.042614,0.110795,0.017045,0.002841
8,0.128571,0.032143,0.014286,0.010714,0.064286,0.046429,0.000000,0.385714,0.000000,0.000000,0.003571,0.003571,0.014286,0.107143,0.046429,0.103571,0.035714,0.003571
9,0.097087,0.033981,0.043689,0.024272,0.131068,0.077670,0.009709,0.262136,0.004854,0.004854,0.004854,0.000000,0.004854,0.087379,0.043689,0.131068,0.033981,0.004854
10,0.093426,0.080738,0.038062,0.073818,0.211073,0.017301,0.002307,0.133795,0.043829,0.004614,0.021915,0.042676,0.009227,0.077278,0.081892,0.031142,0.027682,0.009227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,0.104132,0.056198,0.006612,0.046281,0.214876,0.033058,0.000000,0.185124,0.019835,0.004959,0.014876,0.003306,0.016529,0.069421,0.132231,0.061157,0.021488,0.009917
6036,0.091070,0.049083,0.020106,0.031934,0.154347,0.031342,0.006505,0.219988,0.011236,0.010053,0.043761,0.018332,0.020106,0.072147,0.099941,0.086931,0.024837,0.008279
6037,0.068627,0.022059,0.002451,0.014706,0.144608,0.044118,0.002451,0.240196,0.009804,0.022059,0.022059,0.009804,0.031863,0.053922,0.095588,0.166667,0.039216,0.009804
6039,0.028881,0.036101,0.046931,0.061372,0.234657,0.007220,0.000000,0.101083,0.018051,0.021661,0.003610,0.151625,0.061372,0.108303,0.028881,0.050542,0.032491,0.007220


In [5]:
user_genres_info = pd.concat([frac_df, ratings_pivot], axis=1)
user_genres_info = pd.merge(user_genres_info, users, on='user_id')
age_mapping = {
    'Under 18': 0,
    '18-24': 1,
    '25-34': 2,
    '35-44': 3,
    '45-49': 4,
    '50-55': 5,
    '56+': 6
}
user_genres_info['age'] = user_genres_info['age_desc'].map(age_mapping)
gender_mapping = {
    'F': 0,
    'M': 1
}
user_genres_info['gender'] = user_genres_info['gender'].map(gender_mapping)

user_genres_info


Unnamed: 0,user_id,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,...,Romance,Sci-Fi,Thriller,War,Western,gender,zipcode,age_desc,occ_desc,age
0,2,0.194444,0.065972,0.000000,0.000000,0.086806,0.041667,0.000000,0.274306,0.003472,...,3.708333,3.588235,3.483871,3.733333,4.333333,1,70072,56+,self-employed,6
1,5,0.088068,0.025568,0.011364,0.017045,0.159091,0.059659,0.017045,0.295455,0.000000,...,3.100000,3.066667,2.846154,3.500000,4.000000,1,55455,25-34,writer,2
2,8,0.128571,0.032143,0.014286,0.010714,0.064286,0.046429,0.000000,0.385714,0.000000,...,4.066667,4.461538,4.034483,4.200000,4.000000,1,11413,25-34,programmer,2
3,9,0.097087,0.033981,0.043689,0.024272,0.131068,0.077670,0.009709,0.262136,0.004854,...,3.555556,3.888889,3.629630,4.142857,5.000000,1,61614,25-34,technician/engineer,2
4,10,0.093426,0.080738,0.038062,0.073818,0.211073,0.017301,0.002307,0.133795,0.043829,...,4.373134,4.183099,4.222222,3.833333,3.875000,0,95370,35-44,academic/educator,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2904,6035,0.104132,0.056198,0.006612,0.046281,0.214876,0.033058,0.000000,0.185124,0.019835,...,3.047619,2.762500,1.972973,3.307692,2.333333,0,78734,25-34,academic/educator,2
2905,6036,0.091070,0.049083,0.020106,0.031934,0.154347,0.031342,0.006505,0.219988,0.011236,...,3.352459,2.834320,3.142857,3.785714,3.642857,0,32603,25-34,scientist,2
2906,6037,0.068627,0.022059,0.002451,0.014706,0.144608,0.044118,0.002451,0.240196,0.009804,...,3.681818,3.692308,3.705882,4.000000,3.750000,0,76006,45-49,academic/educator,4
2907,6039,0.028881,0.036101,0.046931,0.061372,0.234657,0.007220,0.000000,0.101083,0.018051,...,3.800000,4.250000,4.142857,4.111111,4.500000,0,01060,45-49,other or not specified,4


In [6]:
from sklearn.model_selection import train_test_split


X_gender = user_genres_info.iloc[:, 1:37].values
y_gender = np.asarray(user_genres_info['gender'].values)

X_age = user_genres_info.iloc[:, 1:37].values
y_age = np.asarray(user_genres_info['age'].values)

X_gender_train, X_gender_test, y_gender_train, y_gender_test = train_test_split(
    X_gender, y_gender, test_size=0.25)
X_age_train, X_age_test, y_age_train, y_age_test = train_test_split(
    X_age, y_age, test_size=0.25)


In [7]:
import torch
import torch.nn as nn
import torch.optim as optim


class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x



In [8]:
gender_net = Net(36, 18, 2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(gender_net.parameters(), lr=0.01)

for epoch in range(1000):
    # Forward pass
    outputs = gender_net(torch.FloatTensor(X_gender_train))
    loss = criterion(outputs, torch.LongTensor(y_gender_train))

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print loss after every 100 epochs
    if (epoch+1) % 100 == 0:
        print(f"Epoch [{epoch+1}/1000], Loss: {loss.item():.4f}")

# Evaluate the trained model on the test set
with torch.no_grad():
    outputs = gender_net(torch.FloatTensor(X_gender_test))
    _, predicted = torch.max(outputs.data, 1)
    accuracy = (predicted == torch.LongTensor(
        y_gender_test)).sum().item() / len(y_gender_test)
    print(f"Accuracy on test set: {accuracy:.4f}")


Epoch [100/1000], Loss: 0.4508
Epoch [200/1000], Loss: 0.4075
Epoch [300/1000], Loss: 0.3920
Epoch [400/1000], Loss: 0.3813
Epoch [500/1000], Loss: 0.3709
Epoch [600/1000], Loss: 0.3683
Epoch [700/1000], Loss: 0.3650
Epoch [800/1000], Loss: 0.3663
Epoch [900/1000], Loss: 0.3614
Epoch [1000/1000], Loss: 0.3564
Accuracy on test set: 0.8132


In [10]:
age_net = Net(36, 18, 7)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(age_net.parameters(), lr=0.01)

for epoch in range(1000):
    # Forward pass
    outputs = age_net(torch.FloatTensor(X_age_train))
    loss = criterion(outputs, torch.LongTensor(y_age_train))

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print loss after every 100 epochs
    if (epoch+1) % 100 == 0:
        print(f"Epoch [{epoch+1}/1000], Loss: {loss.item():.4f}")

# Evaluate the trained model on the test set
with torch.no_grad():
    outputs = age_net(torch.FloatTensor(X_age_test))
    _, predicted = torch.max(outputs.data, 1)
    accuracy = (predicted == torch.LongTensor(
        y_age_test)).sum().item() / len(y_age_test)
    print(f"Accuracy on test set: {accuracy:.4f}")


Epoch [100/1000], Loss: 1.5947
Epoch [200/1000], Loss: 1.5202
Epoch [300/1000], Loss: 1.4754
Epoch [400/1000], Loss: 1.4500
Epoch [500/1000], Loss: 1.4379
Epoch [600/1000], Loss: 1.4310
Epoch [700/1000], Loss: 1.4259
Epoch [800/1000], Loss: 1.4204
Epoch [900/1000], Loss: 1.4151
Epoch [1000/1000], Loss: 1.4082
Accuracy on test set: 0.3901
