In [1]:
# %load Extract_MovieLens_Data.py


# # This notebook is to extract data from Movie Lens
# * The data contents are explained in http://files.grouplens.org/papers/ml-1m-README.txt
# 
# ## users.dat
# 
# UserID::Gender::Age::Occupation::Zip-code
# - Gender is denoted by a "M" for male and "F" for female
# - Age is chosen from the following ranges:
# 
# 	*  1:  "Under 18"
# 	* 18:  "18-24"
# 	* 25:  "25-34"
# 	* 35:  "35-44"
# 	* 45:  "45-49"
# 	* 50:  "50-55"
# 	* 56:  "56+"
# 
# - Occupation is chosen from the following choices:
# 
# 	*  0:  "other" or not specified
# 	*  1:  "academic/educator"
# 	*  2:  "artist"
# 	*  3:  "clerical/admin"
# 	*  4:  "college/grad student"
# 	*  5:  "customer service"
# 	*  6:  "doctor/health care"
# 	*  7:  "executive/managerial"
# 	*  8:  "farmer"
# 	*  9:  "homemaker"
# 	* 10:  "K-12 student"
# 	* 11:  "lawyer"
# 	* 12:  "programmer"
# 	* 13:  "retired"
# 	* 14:  "sales/marketing"
# 	* 15:  "scientist"
# 	* 16:  "self-employed"
# 	* 17:  "technician/engineer"
# 	* 18:  "tradesman/craftsman"
# 	* 19:  "unemployed"
# 	* 20:  "writer"
# 
# ## movies.dat
# MovieID::Title::Genres
# 
# ## ratings.dat
# UserID::MovieID::Rating::Timestamp

# In[1]:

import pandas as pd

# In[3]:

users_df = pd.read_csv("./ml-1m/users.dat"
                    , sep='::'
                    , header=None
                    , names=["UserID", "Gender", "Age", "Occupation", "Zip-code"])
ocupation_codes = {'ocupation_code': [x for x in range(21)]
                   , 'Occupation_name': ["other or not specified", "academic/educator", "artist"
                                  , "clerical/admin", "college/grad student", "customer service"
                                  , "doctor/health care", "executive/managerial", "farmer"
                                  , "homemaker", "K-12 student", "lawyer", "programmer", "retired"
                                  , "sales/marketing" ,"scientist", "self-employed", "technician/engineer"
                                  , "tradesman/craftsman", "unemployed", "writer"]
                  }
ocupation_codes = pd.DataFrame(ocupation_codes)
users_df = users_df.merge(ocupation_codes, left_on=["Occupation"], right_on=["ocupation_code"], how='left')
users_df = users_df.drop(["Occupation", "ocupation_code"], axis=1).rename(columns={'Occupation_name': 'Occupation'})


# In[4]:

movies_org_df = pd.read_csv("./ml-1m/movies.dat"
                            , sep='::'
                            , header=None
                            , names=["MovieID", "Title", "Genres"])
rows = []
for _, row in movies_org_df.iterrows():
    for gen in row.Genres.split('|'):
        rows.append([row['MovieID'], row['Title'], gen])
movies_df = pd.DataFrame(rows, columns=movies_org_df.columns)


# In[5]:

ratings_df = pd.read_csv("./ml-1m/ratings.dat"
                         , sep='::'
                         , header=None
                         , names=["UserID", "MovieID", "Rating", "Timestamp"])
ratings_df['rating_dt'] = pd.to_datetime(ratings_df['Timestamp'],unit='s')



In [2]:
print(users_df.columns)
print(movies_df.columns)
print(ratings_df.columns)

Index(['UserID', 'Gender', 'Age', 'Zip-code', 'Occupation'], dtype='object')
Index(['MovieID', 'Title', 'Genres'], dtype='object')
Index(['UserID', 'MovieID', 'Rating', 'Timestamp', 'rating_dt'], dtype='object')


In [37]:
df = ratings_df.merge(users_df, on=['UserID'], how='left').merge(movies_df, on=['MovieID'], how='left')

In [4]:
df.columns

Index(['UserID', 'MovieID', 'Rating', 'Timestamp', 'rating_dt', 'Gender',
       'Age', 'Zip-code', 'Occupation', 'Title', 'Genres'],
      dtype='object')

In [5]:
import datetime as dt
import numpy as np

In [6]:
ratings_2001_Q3 = df[(df.rating_dt < dt.datetime(2000, 10, 1)) & (df.rating_dt >= dt.datetime(2000, 7, 1))]

In [7]:
ratings_2001_Q3[["Rating_mean", "Age_mean"]] = ratings_2001_Q3.groupby(["Genres", "Gender", "Occupation"])[['Rating', 'Age']] \
.transform(np.mean).rename(columns={
    "Rating": "Rating_mean"
    , "Age": "Age_mean"
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [None]:
a = ratings_2001_Q3.groupby(["Genres", "Gender", "Occupation"])[['Rating', 'Age']].agg([np.mean, np.std])
a = a.reset_index()
# a.columns = a.columns.get_level_values(0)
a.columns.values
a.columns = [x[:-1] if x.endswith('_') else x for x in ['_'.join(x) for x in a.columns.values]]
a

In [25]:
#df["Rating_mean"] = df.groupby(["Occupation", "Genres"])['Rating'].transform(np.mean)
#df.sort_values(by="Rating_mean", ascending=False).groupby(["Occupation"]).head(5)
df.groupby(["Occupation", "Genres"])['Rating'].agg(np.mean).reset_index().rename(columns={"Rating": "Rating_avg"}).sort_values(by="Rating_avg", ascending=False).groupby(["Occupation"]).head(5).sort_values(by=["Occupation", "Rating_avg"])

Unnamed: 0,Occupation,Genres,Rating_avg
12,K-12 student,Mystery,3.636612
5,K-12 student,Crime,3.687085
7,K-12 student,Drama,3.782167
16,K-12 student,War,3.880144
9,K-12 student,Film-Noir,4.212766
29,academic/educator,Musical,3.701586
25,academic/educator,Drama,3.754293
34,academic/educator,War,3.882950
24,academic/educator,Documentary,3.984887
27,academic/educator,Film-Noir,4.082613


In [43]:
# Find top 5 ratings for each occupation
#df = df.groupby(['Occupation', 'Genres']).agg({'Rating': np.mean}).reset_index()
df.assign(Rating_avg=df.groupby(['Occupation', 'Genres'])[['Rating']].transform(np.mean))[['Occupation', 'Genres', 'Rating_avg']] \
.drop_duplicates() \
.sort_values(by="Rating_avg", ascending=False) \
.groupby(["Occupation"]).head(5).sort_values(by=["Occupation", "Rating_avg"])

Unnamed: 0,Occupation,Genres,Rating_avg
4960,K-12 student,Mystery,3.636612
91,K-12 student,Crime,3.687085
0,K-12 student,Drama,3.782167
46,K-12 student,War,3.880144
4953,K-12 student,Film-Noir,4.212766
1747,academic/educator,Musical,3.701586
1120,academic/educator,Drama,3.754293
1145,academic/educator,War,3.882950
2130,academic/educator,Documentary,3.984887
1708,academic/educator,Film-Noir,4.082613
