In [1]:
# %load Extract_MovieLens_Data.py


# # This notebook extracts data from the MovieLens 1M dataset
# * The data contents are explained in http://files.grouplens.org/papers/ml-1m-README.txt
# 
# ## users.dat
# 
# UserID::Gender::Age::Occupation::Zip-code
# - Gender is denoted by a "M" for male and "F" for female
# - Age is chosen from the following ranges:
# 
# 	*  1:  "Under 18"
# 	* 18:  "18-24"
# 	* 25:  "25-34"
# 	* 35:  "35-44"
# 	* 45:  "45-49"
# 	* 50:  "50-55"
# 	* 56:  "56+"
# 
# - Occupation is chosen from the following choices:
# 
# 	*  0:  "other" or not specified
# 	*  1:  "academic/educator"
# 	*  2:  "artist"
# 	*  3:  "clerical/admin"
# 	*  4:  "college/grad student"
# 	*  5:  "customer service"
# 	*  6:  "doctor/health care"
# 	*  7:  "executive/managerial"
# 	*  8:  "farmer"
# 	*  9:  "homemaker"
# 	* 10:  "K-12 student"
# 	* 11:  "lawyer"
# 	* 12:  "programmer"
# 	* 13:  "retired"
# 	* 14:  "sales/marketing"
# 	* 15:  "scientist"
# 	* 16:  "self-employed"
# 	* 17:  "technician/engineer"
# 	* 18:  "tradesman/craftsman"
# 	* 19:  "unemployed"
# 	* 20:  "writer"
# 
# ## movies.dat
# MovieID::Title::Genres
# 
# ## ratings.dat
# UserID::MovieID::Rating::Timestamp

# In[1]:

import pandas as pd

# In[3]:

# Load the user table. The file is '::'-delimited with no header row. A
# multi-character separator is only handled by pandas' python parser engine,
# so request it explicitly rather than relying on the fallback that emits a
# ParserWarning.
users_df = pd.read_csv(
    "./ml-1m/users.dat",
    sep="::",
    engine="python",
    header=None,
    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],
)

# Lookup table from numeric occupation code (0-20) to its label; the label at
# list position i belongs to code i.
# NOTE: the 'ocupation' spelling is kept as-is so existing references to this
# name and column keep working.
ocupation_codes = pd.DataFrame({
    "ocupation_code": list(range(21)),
    "Occupation_name": [
        "other or not specified", "academic/educator", "artist",
        "clerical/admin", "college/grad student", "customer service",
        "doctor/health care", "executive/managerial", "farmer",
        "homemaker", "K-12 student", "lawyer", "programmer", "retired",
        "sales/marketing", "scientist", "self-employed", "technician/engineer",
        "tradesman/craftsman", "unemployed", "writer",
    ],
})

# Swap the numeric code for its label: left-join the lookup (every user row is
# kept), drop both code columns, and let the label take over the 'Occupation'
# name. The relabelled column ends up last in the frame.
users_df = (
    users_df
    .merge(ocupation_codes, left_on=["Occupation"], right_on=["ocupation_code"], how="left")
    .drop(["Occupation", "ocupation_code"], axis=1)
    .rename(columns={"Occupation_name": "Occupation"})
)


# In[4]:

# Load the movie table ('::'-delimited, no header row). The python engine is
# requested explicitly because the default C engine cannot parse a
# multi-character separator and would fall back with a ParserWarning.
movies_org_df = pd.read_csv(
    "./ml-1m/movies.dat",
    sep="::",
    engine="python",
    header=None,
    names=["MovieID", "Title", "Genres"],
)

# Expand the pipe-separated genre list into one row per (movie, genre) pair.
# Zipping the raw columns avoids materialising a Series for every row, which
# is what iterrows() does.
rows = [
    [movie_id, title, genre]
    for movie_id, title, genres in zip(
        movies_org_df["MovieID"], movies_org_df["Title"], movies_org_df["Genres"]
    )
    for genre in genres.split("|")
]
movies_df = pd.DataFrame(rows, columns=movies_org_df.columns)


# In[5]:

# Load the ratings table ('::'-delimited, no header row). The python engine is
# requested explicitly because the default C engine cannot parse a
# multi-character separator and would fall back with a ParserWarning.
ratings_df = pd.read_csv(
    "./ml-1m/ratings.dat",
    sep="::",
    engine="python",
    header=None,
    names=["UserID", "MovieID", "Rating", "Timestamp"],
)

# Timestamp is interpreted as seconds since the Unix epoch (unit='s') and
# stored as a proper datetime column alongside the raw value.
ratings_df["rating_dt"] = pd.to_datetime(ratings_df["Timestamp"], unit="s")



In [2]:
# Show the column index of each loaded frame, one per line.
print(users_df.columns, movies_df.columns, ratings_df.columns, sep="\n")

Index(['UserID', 'Gender', 'Age', 'Zip-code', 'Occupation'], dtype='object')
Index(['MovieID', 'Title', 'Genres'], dtype='object')
Index(['UserID', 'MovieID', 'Rating', 'Timestamp', 'rating_dt'], dtype='object')


In [3]:
# Build the analysis frame: attach user attributes to every rating, then attach
# movie title/genre. movies_df holds one row per (movie, genre), so a rating of
# a multi-genre movie appears once per genre in the result.
df = pd.merge(
    pd.merge(ratings_df, users_df, how='left', on=['UserID']),
    movies_df,
    how='left',
    on=['MovieID'],
)

In [None]:
# Inspect the columns of the merged ratings/users/movies frame.
df.columns

In [9]:
import datetime as dt
import numpy as np

In [None]:
# Ratings made from 2000-07-01 (inclusive) up to 2000-10-01 (exclusive).
# NOTE(review): the variable name says 2001 Q3 but the bounds select Q3 of
# 2000 - confirm which one is intended before relying on the name.
# .copy() makes the subset an independent frame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning.
ratings_2001_Q3 = df[
    (df.rating_dt >= dt.datetime(2000, 7, 1)) & (df.rating_dt < dt.datetime(2000, 10, 1))
].copy()

In [None]:
# Per-row group means: for every (Genres, Gender, Occupation) group, broadcast
# the group's mean Rating and mean Age back onto each of its rows.
# The "mean" alias is used instead of np.mean because recent pandas emits a
# FutureWarning when a NumPy callable is passed to transform.
_group_means = (
    ratings_2001_Q3
    .groupby(["Genres", "Gender", "Occupation"])[["Rating", "Age"]]
    .transform("mean")
)

# assign() returns a new frame, so this works without SettingWithCopyWarning
# even though ratings_2001_Q3 was obtained by filtering another frame.
ratings_2001_Q3 = ratings_2001_Q3.assign(
    Rating_mean=_group_means["Rating"],
    Age_mean=_group_means["Age"],
)

In [None]:
# Mean and standard deviation of Rating and Age per (Genres, Gender,
# Occupation) group. The "mean"/"std" aliases are used instead of
# np.mean/np.std because recent pandas emits a FutureWarning when NumPy
# callables are passed to agg; the resulting column labels are the same.
a = (
    ratings_2001_Q3
    .groupby(["Genres", "Gender", "Occupation"])[["Rating", "Age"]]
    .agg(["mean", "std"])
    .reset_index()
)

# Flatten the two-level column index into single names: ('Rating', 'mean')
# becomes 'Rating_mean', while the key columns, whose second level is empty,
# keep their plain name (e.g. 'Genres').
a.columns = ["_".join(part for part in col if part) for col in a.columns.values]
a

In [None]:
# Top 5 genres by average rating for each occupation (aggregation variant).
# Steps: average Rating per (Occupation, Genres), rank all pairs by that
# average, keep the first five per occupation, then order the result by
# occupation and ascending average for display.
# The "mean" alias replaces np.mean, which triggers a FutureWarning in recent
# pandas when passed to agg.
(
    df.groupby(["Occupation", "Genres"])["Rating"]
    .agg("mean")
    .reset_index()
    .rename(columns={"Rating": "Rating_avg"})
    .sort_values(by="Rating_avg", ascending=False)
    .groupby(["Occupation"])
    .head(5)
    .sort_values(by=["Occupation", "Rating_avg"])
)

In [None]:
# Top 5 genres by average rating for each occupation (transform variant).
# The (Occupation, Genres) group mean is broadcast onto every row, the frame is
# reduced to one row per group with drop_duplicates, and the five
# highest-rated genres per occupation are kept.
# The mean is taken from the 'Rating' Series (not a one-column DataFrame) with
# the "mean" alias, which avoids the FutureWarning recent pandas raises for
# NumPy callables.
(
    df.assign(Rating_avg=df.groupby(["Occupation", "Genres"])["Rating"].transform("mean"))
    .loc[:, ["Occupation", "Genres", "Rating_avg"]]
    .drop_duplicates()
    .sort_values(by="Rating_avg", ascending=False)
    .groupby(["Occupation"])
    .head(5)
    .sort_values(by=["Occupation", "Rating_avg"])
)

In [None]:
# df.groupby(["Occupation", "Genres"]).transform(lambda x : x if print(type(x)) else x)
# df.groupby(["Occupation", "Genres"])['Rating'].transform(lambda x : x if print(len(x)) else x)

In [27]:
# df.groupby(["Occupation", "Genres"])['Rating'].transform(lambda x: x.mean())
# df.groupby(["Occupation", "Genres"]).transform(lambda x: x.mean())
# df.groupby(["Occupation", "Genres"])['Rating'].apply(lambda x: x.mean())
# df.groupby(["Occupation", "Genres"])['Rating'].agg(lambda x: x.mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,UserID,MovieID,Rating,Timestamp,Age
Occupation,Genres,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
K-12 student,Action,2532.905884,1767.276084,3.497116,9.747699e+08,4.708917
K-12 student,Adventure,2538.595813,1492.607354,3.425658,9.745419e+08,4.522544
K-12 student,Animation,2576.597967,1773.637246,3.463956,9.743501e+08,2.849815
K-12 student,Children's,2586.375772,1518.847479,3.220679,9.746313e+08,2.698560
K-12 student,Comedy,2545.116746,1891.971896,3.497200,9.765825e+08,3.963867
K-12 student,Crime,2429.580812,1671.200000,3.687085,9.781680e+08,5.019926
K-12 student,Documentary,2478.510204,2253.989796,3.581633,9.796865e+08,4.510204
K-12 student,Drama,2433.251333,1907.594500,3.782167,9.798538e+08,4.899167
K-12 student,Fantasy,2653.578824,1560.192941,3.298039,9.740892e+08,4.365490
K-12 student,Film-Noir,2389.336170,1751.102128,4.212766,9.798338e+08,5.029787


In [17]:
def _echo_chunk_size(chunk):
    """Print the length of the chunk handed over by transform, then return it unchanged."""
    print(len(chunk))
    return chunk


# Exploration aid: shows the size of each piece that groupby().transform()
# passes to the callable; the data itself comes back unmodified.
df.groupby(["Occupation", "Genres"]).transform(_echo_chunk_size)

6067
6067
3726
2164
3888
9465
1355
98
6000
1275
235
1905
1410
732
2990
3932
4212
1385
300
18673
10215
3242
5600
29949
6326
794
34769
2763
1852
5079
3847
3730
14433
11795
14723
6006
1715
11277
5986
2163
3375
17982
3957
553
19273
1760
1000
3676
2387
2026
7824
7110
8683
3072
973
7078
3700
1321
2226
11870
2469
285
11533
1043
677
2210
1518
1356
5210
4022
5922
1950
659
35152
17910
7036
11264
48672
10772
970
43546
5225
1770
10383
5442
4740
18913
20352
25302
8205
2091
6493
3228
974
1597
7908
1734
97
6369
934
329
2390
888
718
2649
3789
4238
1348
546
8601
4446
1565
2596
13148
2800
302
14332
1216
705
2576
1646
1469
5971
5146
6781
2572
717
28863
14138
3451
6251
35784
8738
707
38888
3664
1865
7479
3881
4270
15453
16316
20654
7981
2695
868
528
134
261
939
198
10
835
152
41
182
100
82
373
550
497
164
51
2098
1299
693
1281
5065
699
35
4152
409
130
542
694
391
2681
1065
1710
664
191
4793
2563
726
1209
7526
1795
191
7832
610
537
1249
874
920
2989
2776
3632
1585
535
32957
16919
5583
9572
46500
10460
1155

Unnamed: 0,UserID,MovieID,Rating,Timestamp,rating_dt,Gender,Age,Zip-code,Occupation,Title,Genres
0,1,1193,5,978300760,2000-12-31 22:12:40,F,1,48067,K-12 student,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,2000-12-31 22:35:09,F,1,48067,K-12 student,James and the Giant Peach (1996),Animation
2,1,661,3,978302109,2000-12-31 22:35:09,F,1,48067,K-12 student,James and the Giant Peach (1996),Children's
3,1,661,3,978302109,2000-12-31 22:35:09,F,1,48067,K-12 student,James and the Giant Peach (1996),Musical
4,1,914,3,978301968,2000-12-31 22:32:48,F,1,48067,K-12 student,My Fair Lady (1964),Musical
5,1,914,3,978301968,2000-12-31 22:32:48,F,1,48067,K-12 student,My Fair Lady (1964),Romance
6,1,3408,4,978300275,2000-12-31 22:04:35,F,1,48067,K-12 student,Erin Brockovich (2000),Drama
7,1,2355,5,978824291,2001-01-06 23:38:11,F,1,48067,K-12 student,"Bug's Life, A (1998)",Animation
8,1,2355,5,978824291,2001-01-06 23:38:11,F,1,48067,K-12 student,"Bug's Life, A (1998)",Children's
9,1,2355,5,978824291,2001-01-06 23:38:11,F,1,48067,K-12 student,"Bug's Life, A (1998)",Comedy


In [13]:
# Highest rating each movie has received, attached to every row of df.
# The maximum is computed on df itself: assign() aligns on the row index, and
# df (one row per rating per genre) does not share its index with ratings_df,
# so a Series derived from ratings_df would land on the wrong rows and leave
# NaN on the extra ones. Every rating is present in df, so the per-movie
# maximum is the same as in ratings_df.
df.assign(max_ratings=df.groupby("MovieID")["Rating"].transform("max"))

Unnamed: 0,UserID,MovieID,Rating,Timestamp,rating_dt,Gender,Age,Zip-code,Occupation,Title,Genres,max_ratings
0,1,1193,5,978300760,2000-12-31 22:12:40,F,1,48067,K-12 student,One Flew Over the Cuckoo's Nest (1975),Drama,5.0
1,1,661,3,978302109,2000-12-31 22:35:09,F,1,48067,K-12 student,James and the Giant Peach (1996),Animation,5.0
2,1,661,3,978302109,2000-12-31 22:35:09,F,1,48067,K-12 student,James and the Giant Peach (1996),Children's,5.0
3,1,661,3,978302109,2000-12-31 22:35:09,F,1,48067,K-12 student,James and the Giant Peach (1996),Musical,5.0
4,1,914,3,978301968,2000-12-31 22:32:48,F,1,48067,K-12 student,My Fair Lady (1964),Musical,5.0
5,1,914,3,978301968,2000-12-31 22:32:48,F,1,48067,K-12 student,My Fair Lady (1964),Romance,5.0
6,1,3408,4,978300275,2000-12-31 22:04:35,F,1,48067,K-12 student,Erin Brockovich (2000),Drama,5.0
7,1,2355,5,978824291,2001-01-06 23:38:11,F,1,48067,K-12 student,"Bug's Life, A (1998)",Animation,5.0
8,1,2355,5,978824291,2001-01-06 23:38:11,F,1,48067,K-12 student,"Bug's Life, A (1998)",Children's,5.0
9,1,2355,5,978824291,2001-01-06 23:38:11,F,1,48067,K-12 student,"Bug's Life, A (1998)",Comedy,5.0
