# Import Statements

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from skfuzzy.cluster import cmeans
from skfuzzy.cluster import cmeans_predict
from sklearn.metrics.pairwise import cosine_similarity

# Read Files

In [2]:
base_path = 'Dataset/ml-100k/'

In [3]:
user_data_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
user_data = pd.read_csv(base_path+'u.user', sep='|', names=user_data_cols, encoding='latin-1')

In [4]:
user_data = user_data.drop(['user_id', 'zip_code'], axis=1)

In [5]:
user_data

Unnamed: 0,age,gender,occupation
0,24,M,technician
1,53,F,other
2,23,M,writer
3,24,M,technician
4,33,F,other
...,...,...,...
938,26,F,student
939,32,M,administrator
940,20,M,student
941,48,F,librarian


In [6]:
user_data.describe()

Unnamed: 0,age
count,943.0
mean,34.051962
std,12.19274
min,7.0
25%,25.0
50%,31.0
75%,43.0
max,73.0


In [7]:
user_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   age         943 non-null    int64 
 1   gender      943 non-null    object
 2   occupation  943 non-null    object
dtypes: int64(1), object(2)
memory usage: 22.2+ KB


In [8]:
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_data = pd.read_csv(base_path+'u.data', sep='\t', names=ratings_cols, encoding='latin-1')

In [9]:
ratings_data

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [10]:
ratings_matrix = np.zeros((943,1682))

In [11]:
for i in range(100000):
    ratings_matrix[ratings_data.iloc[i,0]-1][ratings_data.iloc[i,1]-1] = ratings_data.iloc[i,2]

In [12]:
ratings_matrix

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

# Meta-data train-test split

In [13]:
user_data_fold_1 = user_data.iloc[0:188,:]
user_data_fold_2 = user_data.iloc[188:376,:]
user_data_fold_3 = user_data.iloc[376:564,:]
user_data_fold_4 = user_data.iloc[564:752,:]
user_data_fold_5 = user_data.iloc[752:943,:]

In [14]:
test_user_data = user_data_fold_2
train_user_data = pd.concat([user_data_fold_1,user_data_fold_3,user_data_fold_4,user_data_fold_5])

In [15]:
train_user_data

Unnamed: 0,age,gender,occupation
0,24,M,technician
1,53,F,other
2,23,M,writer
3,24,M,technician
4,33,F,other
...,...,...,...
938,26,F,student
939,32,M,administrator
940,20,M,student
941,48,F,librarian


In [16]:
test_user_data

Unnamed: 0,age,gender,occupation
188,32,M,artist
189,30,M,administrator
190,33,M,administrator
191,42,M,educator
192,29,M,student
...,...,...,...
371,25,F,student
372,24,F,other
373,36,M,executive
374,17,M,entertainment


In [17]:
ratings_matrix_fold_1 = ratings_matrix[0:188,:]
ratings_matrix_fold_2 = ratings_matrix[188:376,:]
ratings_matrix_fold_3 = ratings_matrix[376:564,:]
ratings_matrix_fold_4 = ratings_matrix[564:752,:]
ratings_matrix_fold_5 = ratings_matrix[752:943,:]

In [18]:
test_ratings_matrix = ratings_matrix_fold_2
train_ratings_matrix = np.vstack((ratings_matrix_fold_1,ratings_matrix_fold_3,ratings_matrix_fold_4,ratings_matrix_fold_5))

In [19]:
print(train_ratings_matrix.shape)
print(train_ratings_matrix[0])

(755, 1682)
[5. 3. 4. ... 0. 0. 0.]


In [20]:
print(test_ratings_matrix.shape)
print(test_ratings_matrix[0])

(188, 1682)
[5. 0. 0. ... 0. 0. 0.]


# Categorical to Numeric

In [21]:
kbins = KBinsDiscretizer(n_bins=5, encode='onehot', strategy='quantile')
train_user_data_age = kbins.fit_transform(train_user_data[['age']])
test_user_data_age = kbins.transform(test_user_data[['age']])
print(kbins.bin_edges_)

[array([ 7. , 23. , 29. , 35.4, 47. , 73. ])]


In [22]:
train_user_data_age = csr_matrix(train_user_data_age, dtype=np.int64).toarray()
test_user_data_age = csr_matrix(test_user_data_age, dtype=np.int64).toarray()

In [23]:
# print(type(train_user_data_age))
# print(train_user_data_age.dtype)
# print(train_user_data_age.shape)
# print(train_user_data_age[0])

In [24]:
ohe = OneHotEncoder(categories='auto', drop=None, sparse=True, dtype=np.int64, handle_unknown='ignore')
train_user_data_gender_occupation = ohe.fit_transform(train_user_data[['gender','occupation']])
test_user_data_gender_occupation = ohe.transform(test_user_data[['gender','occupation']])
print(ohe.categories_)

[array(['F', 'M'], dtype=object), array(['administrator', 'artist', 'doctor', 'educator', 'engineer',
       'entertainment', 'executive', 'healthcare', 'homemaker', 'lawyer',
       'librarian', 'marketing', 'none', 'other', 'programmer', 'retired',
       'salesman', 'scientist', 'student', 'technician', 'writer'],
      dtype=object)]


In [25]:
train_user_data_gender_occupation = csr_matrix(train_user_data_gender_occupation, dtype=np.int64).toarray()
test_user_data_gender_occupation = csr_matrix(test_user_data_gender_occupation, dtype=np.int64).toarray()

In [26]:
# print(type(train_user_data_gender_occupation))
# print(train_user_data_gender_occupation.dtype)
# print(train_user_data_gender_occupation.shape)
# print(train_user_data_gender_occupation)

In [27]:
train_user_data_numerical = np.hstack((train_user_data_age, train_user_data_gender_occupation))
test_user_data_numerical = np.hstack((test_user_data_age, test_user_data_gender_occupation))

In [28]:
print(type(train_user_data_numerical))
print(train_user_data_numerical.dtype)
print(train_user_data_numerical.shape)
print(train_user_data_numerical[0])

<class 'numpy.ndarray'>
int64
(755, 28)
[0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]


In [29]:
print(type(test_user_data_numerical))
print(test_user_data_numerical.dtype)
print(test_user_data_numerical.shape)
print(test_user_data_numerical[0])

<class 'numpy.ndarray'>
int64
(188, 28)
[0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [30]:
train = [[] for _ in range(train_user_data_numerical.shape[0])]
for i in range(train_user_data_numerical.shape[0]):
    train[i].append(train_user_data_numerical[i])
    train[i].append(train_ratings_matrix[i])

In [31]:
print(len(train))
print(len(train[0]))
print(len(train[0][0]))
print(len(train[0][1]))
print(train[0])

755
2
28
1682
[array([0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0], dtype=int64), array([5., 3., 4., ..., 0., 0., 0.])]


In [32]:
test = [[] for _ in range(test_user_data_numerical.shape[0])]
for i in range(test_user_data_numerical.shape[0]):
    test[i].append(test_user_data_numerical[i])
    test[i].append(test_ratings_matrix[i])

In [33]:
print(len(test))
print(len(test[0]))
print(len(test[0][0]))
print(len(test[0][1]))
print(test[0])

188
2
28
1682
[array([0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0], dtype=int64), array([5., 0., 0., ..., 0., 0., 0.])]


# Fold 2 k=5,10,20

In [34]:
for k in [5,10,20]:
    # Cluster user data
    crisp_kmeans_model = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=300, random_state=0, algorithm='full')
    train_user_data_cluster = crisp_kmeans_model.fit_transform(train_user_data_numerical)
    test_user_data_cluster = crisp_kmeans_model.transform(test_user_data_numerical)
    
    train_user_cluster_numbers = crisp_kmeans_model.labels_
    test_user_cluster_numbers = crisp_kmeans_model.predict(test_user_data_numerical)
    
    cluster_num_vs_train_users = [[] for _ in range(k)]
    for i in range(len(train_user_cluster_numbers)):
        cluster_num_vs_train_users[train_user_cluster_numbers[i]].append(i)
    
    # Predictions
    total_possible_predictions = 0
    total_actual_predictions = 0
    total_absolute_error = 0
    for a in tqdm(range(len(test))):
        threshold = 0.5
        cluster_number_of_a = test_user_cluster_numbers[a]
        neighbours={}
        for user_i in cluster_num_vs_train_users[cluster_number_of_a]:
            cos_sim_a_user_i = cosine_similarity([test[a][0]], [train[user_i][0]])[0][0]
            if(cos_sim_a_user_i>threshold):
                neighbours[user_i] = cos_sim_a_user_i
        for j in range(len(test[a][1])):
            if(test[a][1][j]==0):
                continue
            total_possible_predictions+=1
            summation = 0
            normalizing_factor = 0
            for i in neighbours.keys():
                if(train[i][1][j]==0):
                    continue
                summation += neighbours[i]*train[i][1][j]
                normalizing_factor += abs(neighbours[i])
            if(normalizing_factor==0):
                continue
            predicted_rating_a_j = round(summation/normalizing_factor)
            total_absolute_error += abs(predicted_rating_a_j - test[a][1][j])
            total_actual_predictions+=1
    coverage = (total_actual_predictions/total_possible_predictions)*100
    mean_absolute_error = total_absolute_error/total_actual_predictions
    print('Coverage: {} {}'.format(coverage, '%'))
    print('Mean Absolute Error Fold 2 k={}: {}'.format(k,mean_absolute_error))

100%|████████████████████████████████████████████████████████████████████████████████| 188/188 [00:07<00:00, 25.38it/s]


Coverage: 97.49977716374009 %
Mean Absolute Error Fold 2 k=5: 0.798784111166979


100%|████████████████████████████████████████████████████████████████████████████████| 188/188 [00:04<00:00, 42.44it/s]


Coverage: 96.87137891077636 %
Mean Absolute Error Fold 2 k=10: 0.805115936694884


100%|████████████████████████████████████████████████████████████████████████████████| 188/188 [00:02<00:00, 65.34it/s]

Coverage: 94.28202157054996 %
Mean Absolute Error Fold 2 k=20: 0.8250531789175136





# Fold 2 c=5,10,20

In [35]:
for c in [5,10,20]:
    # Cluster user data
    train_cmeans_cluster_centers, train_final_fuzzy_cpartitioned_matrix, train_initial_guess_at_cpartitioned_matrix, train_final_euclidean_distance_matrix, train_objective_function_history, train_num_of_itrs_run, train_final_fuzzy_partition_coefficient = cmeans(data=train_user_data_numerical.T, c=c, m=2, error=0.0001, maxiter=1000, init=None, seed=0)
    test_final_fuzzy_cpartitioned_matrix, test_initial_guess_at_cpartitioned_matrix, test_final_euclidean_distance_matrix, test_objective_function_history, test_num_of_itrs_run, test_final_fuzzy_partition_coefficient = cmeans_predict(test_data=test_user_data_numerical.T, cntr_trained=train_cmeans_cluster_centers, m=2, error=0.0001, maxiter=1000, init=None, seed=0)
    
    # Predictions
    total_possible_predictions = 0
    total_actual_predictions = 0
    total_absolute_error = 0
    for a in tqdm(range(len(test))):
        threshold = 0.5
        neighbours = {}
        for user_i in range(len(train)):
            cos_sim_a_user_i = cosine_similarity([test[a][0]], [train[user_i][0]])[0][0]
            if(cos_sim_a_user_i>threshold):
                neighbours[user_i] = cos_sim_a_user_i
        for j in range(len(test[a][1])):
            if(test[a][1][j]==0):
                continue
            total_possible_predictions+=1
            cluster_rating = {}
            for k in range(c):
                first_summation=0
                first_normalizing_factor=0
                for i in neighbours.keys():
                    if(train[i][1][j]==0):
                        continue
                    first_summation += train_final_fuzzy_cpartitioned_matrix[k][i]*neighbours[i]*train[i][1][j]
                    first_normalizing_factor += abs(train_final_fuzzy_cpartitioned_matrix[k][i]*neighbours[i])
                if(first_normalizing_factor==0):
                    continue
                cluster_rating[k] = first_summation/first_normalizing_factor
            second_summation=0
            second_normalizing_factor=0
            for kk in cluster_rating.keys():
                second_summation += test_final_fuzzy_cpartitioned_matrix[kk][a]*cluster_rating[kk]
                second_normalizing_factor += abs(test_final_fuzzy_cpartitioned_matrix[kk][a])
            if(second_normalizing_factor==0):
                continue
            predicted_rating_a_j = round(second_summation/second_normalizing_factor)
            total_absolute_error += abs(predicted_rating_a_j - test[a][1][j])
            total_actual_predictions+=1
    coverage = (total_actual_predictions/total_possible_predictions)*100
    mean_absolute_error = total_absolute_error/total_actual_predictions
    print('Coverage: {} {}'.format(coverage, '%'))
    print('Mean Absolute Error Fold 2 c={}: {}'.format(c,mean_absolute_error))

100%|████████████████████████████████████████████████████████████████████████████████| 188/188 [00:39<00:00,  4.77it/s]


Coverage: 98.14154559229877 %
Mean Absolute Error Fold 2 c=5: 0.7923345897098224


100%|████████████████████████████████████████████████████████████████████████████████| 188/188 [00:51<00:00,  3.66it/s]


Coverage: 98.14154559229877 %
Mean Absolute Error Fold 2 c=10: 0.7927887016938377


100%|████████████████████████████████████████████████████████████████████████████████| 188/188 [01:15<00:00,  2.49it/s]

Coverage: 98.14154559229877 %
Mean Absolute Error Fold 2 c=20: 0.7925162345034286



