In [41]:
import pandas as pd
import numpy as np

## **Items**

In [42]:
# Column names for u.item: five descriptive fields followed by the genre indicator columns.
item_columns = (
    ['item id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL']
    + ['unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy']
    + ['Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror']
    + ['Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
)

# The raw file is pipe-separated, has no header row, and is read as latin-1.
item = pd.read_csv(
    '../data/raw/ml-100k/u.item',
    names=item_columns,
    header=None,
    sep='|',
    encoding='latin-1',
)


`video_release_date` contains only NaN values.

In [43]:
# True only when every entry in the column is missing.
item['video_release_date'].isna().all()

True

Movie title, URL and dates do not contain information needed for recommendation, so we drop them.

In [44]:
# Remove the descriptive fields in place; only the id and the genre indicators remain.
item.drop(
    columns=['movie_title', 'release_date', 'video_release_date', 'IMDb_URL'],
    inplace=True,
)
item

Unnamed: 0,item id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1678,1679,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1679,1680,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1680,1681,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [45]:
# Number of missing values in each remaining item column.
item.isnull().sum(axis=0)

item id        0
unknown        0
Action         0
Adventure      0
Animation      0
Children's     0
Comedy         0
Crime          0
Documentary    0
Drama          0
Fantasy        0
Film-Noir      0
Horror         0
Musical        0
Mystery        0
Romance        0
Sci-Fi         0
Thriller       0
War            0
Western        0
dtype: int64

Let's combine genres into one column

In [46]:
# Every column after the first ('item id') is a genre indicator.
genre_columns = item.columns[1:].values


def _join_genres(flags):
    """Return the names of the genres set to 1 in this row, joined by ', '."""
    return ', '.join(label for label, flag in flags.items() if flag == 1)


# Collapse the indicator columns into one comma-separated 'genre' string per movie,
# then discard the individual indicator columns.
item['genre'] = item[genre_columns].apply(_join_genres, axis=1)
item.drop(columns=genre_columns, inplace=True)
item

Unnamed: 0,item id,genre
0,1,"Animation, Children's, Comedy"
1,2,"Action, Adventure, Thriller"
2,3,Thriller
3,4,"Action, Comedy, Drama"
4,5,"Crime, Drama, Thriller"
...,...,...
1677,1678,Drama
1678,1679,"Romance, Thriller"
1679,1680,"Drama, Romance"
1680,1681,Comedy


## **Users**

In [47]:
# u.user is pipe-separated with no header row; supply the column names ourselves.
user_columns = [
    'user id',
    'age',
    'gender',
    'occupation',
    'zip code',
]
user = pd.read_csv(
    '../data/raw/ml-100k/u.user',
    names=user_columns,
    header=None,
    sep='|',
    encoding='latin-1',
)
user

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


Zip code also makes no sense for recommendation, so we drop it.

In [48]:
# Discard the zip code column in place.
user.drop(columns=['zip code'], inplace=True)
user

Unnamed: 0,user id,age,gender,occupation
0,1,24,M,technician
1,2,53,F,other
2,3,23,M,writer
3,4,24,M,technician
4,5,33,F,other
...,...,...,...,...
938,939,26,F,student
939,940,32,M,administrator
940,941,20,M,student
941,942,48,F,librarian


In [49]:
# Number of missing values in each remaining user column.
user.isnull().sum(axis=0)

user id       0
age           0
gender        0
occupation    0
dtype: int64

Let's divide ages into groups; this may improve the score. For example, children are more likely to watch cartoons (in our dataset, the genres `Animation` and `Children's`).

In [50]:
# Bucket ages into right-closed intervals:
#   (0, 12] -> child, (12, 18] -> teenage, (18, 30] -> adults,
#   (30, 45] -> middle age, (45, inf) -> old adults
# Ages that fall in no interval (0 or below) become NaN, exactly as with the
# previous np.digitize + dict-mapping approach.
age_bin_edges = [0, 12, 18, 30, 45, float('inf')]
age_group_labels = ['child', 'teenage', 'adults', 'middle age', 'old adults']

# pd.cut assigns the label for each interval directly, so no intermediate
# integer codes are needed. astype(object) keeps the column as plain strings
# instead of a categorical, matching the dtype produced before.
user['age_group'] = pd.cut(
    user['age'],
    bins=age_bin_edges,
    labels=age_group_labels,
    right=True,
).astype(object)

# The raw age is replaced by its group from here on.
user = user.drop(columns=['age'])
user

Unnamed: 0,user id,gender,occupation,age_group
0,1,M,technician,adults
1,2,F,other,old adults
2,3,M,writer,adults
3,4,M,technician,adults
4,5,F,other,middle age
...,...,...,...,...
938,939,F,student,adults
939,940,M,administrator,middle age
940,941,M,student,adults
941,942,F,librarian,old adults


## **Data**

In [51]:
# u.data is tab-separated with no header row; one row per (user, item) rating.
data_columns = [
    'user id',
    'item id',
    'rating',
    'timestamp',
]
data = pd.read_csv(
    '../data/raw/ml-100k/u.data',
    names=data_columns,
    header=None,
    sep='\t',
    encoding='latin-1',
)
data

Unnamed: 0,user id,item id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


There are no duplicate rows, and no user rated the same film more than once.

In [52]:
# First count fully duplicated rows (subset=None compares all columns),
# then count repeated (user, item) pairs.
for key_columns in (None, ['user id', 'item id']):
    print(data.duplicated(subset=key_columns).sum())

0
0


In [53]:
# Attach item and user attributes to every rating (left joins keep all ratings),
# order each user's ratings by timestamp, and renumber the rows from 0.
merged = (
    data
    .merge(item, how='left', on='item id')
    .merge(user, how='left', on='user id')
    .sort_values(by=['user id', 'timestamp'])
    .reset_index(drop=True)
)
merged

Unnamed: 0,user id,item id,rating,timestamp,genre,gender,occupation,age_group
0,1,168,5,874965478,Comedy,M,technician,adults
1,1,172,5,874965478,"Action, Adventure, Drama, Romance, Sci-Fi, War",M,technician,adults
2,1,165,5,874965518,Drama,M,technician,adults
3,1,156,4,874965556,"Crime, Thriller",M,technician,adults
4,1,196,5,874965677,Drama,M,technician,adults
...,...,...,...,...,...,...,...,...
99995,943,449,1,888693158,"Action, Adventure, Sci-Fi",M,student,adults
99996,943,229,2,888693158,"Action, Adventure, Sci-Fi",M,student,adults
99997,943,230,1,888693158,"Action, Adventure, Sci-Fi",M,student,adults
99998,943,228,3,888693158,"Action, Adventure, Sci-Fi",M,student,adults


In [54]:
# The timestamp was only needed for ordering; remove it in place.
merged.drop(columns=['timestamp'], inplace=True)

In [55]:
# Display the merged ratings table (user, item, rating plus item/user attributes).
merged

Unnamed: 0,user id,item id,rating,genre,gender,occupation,age_group
0,1,168,5,Comedy,M,technician,adults
1,1,172,5,"Action, Adventure, Drama, Romance, Sci-Fi, War",M,technician,adults
2,1,165,5,Drama,M,technician,adults
3,1,156,4,"Crime, Thriller",M,technician,adults
4,1,196,5,Drama,M,technician,adults
...,...,...,...,...,...,...,...
99995,943,449,1,"Action, Adventure, Sci-Fi",M,student,adults
99996,943,229,2,"Action, Adventure, Sci-Fi",M,student,adults
99997,943,230,1,"Action, Adventure, Sci-Fi",M,student,adults
99998,943,228,3,"Action, Adventure, Sci-Fi",M,student,adults


In [56]:
# Save the cleaned table; with to_csv defaults the row index is written as the first column.
merged.to_csv(path_or_buf='../data/interim/100k_clean.csv')