In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import time
import statsmodels.api as sm
from sklearn import linear_model
from tqdm import tnrange, tqdm_notebook
from sklearn.metrics import mean_squared_error
from statsmodels.regression.quantile_regression import QuantReg

from sklearn.preprocessing import StandardScaler

import os

from sklearn.model_selection import train_test_split

In [2]:
# df=pd.read_csv('./ml-1m/movies.dat',sep='::')

In [3]:
rt = np.loadtxt('./ml-1m/ratings.dat',delimiter='::',dtype=np.int64)

#### - UserIDs  1 ~ 6040 
#### - MovieIDs 1 ~ 3952
#### - Ratings  5-star scale (whole-star ratings only)
#### - Timestamp is represented in seconds since the epoch as returned by time(2)

- rating mean

In [4]:
# rt[:,2].mean()

- How many users participated

In [5]:
# len(rt)

In [6]:
users = len(np.unique(rt[:,0]))
users

6040

#### - mean of rating for each user

##### using for 

In [7]:
# user_mean =[]
# for x in range(users):
#     rt_ = rt[:,0] == (x+1)
#     if x ==0:
#         print(rt_)
#     user = rt[rt_,2]
#     user_mean.append(user.mean())

In [8]:
# user_mean

In [9]:
# np.mean(user_mean)

- transform to Pandas 

In [10]:
rtp = pd.DataFrame(rt,columns=['usr_id','mv_id','star','timestamp'])

In [11]:
rtp['star_mean'] = rtp.groupby(['usr_id'])['star'].transform('mean')
rtp['star_std'] = rtp.groupby(['usr_id'])['star'].transform('std')

In [12]:
for qt in [0.0,0.25,0.5,0.75,1] :
    rtp['star_qt%d'%qt] = rtp.groupby(['usr_id'])['star'].transform(lambda x: x.quantile(q=qt))

#### integrate Data (movies.dat,ratings.dat,users.dat)
- "ratings.dat" UserID::MovieID::Rating::Timestamp
- "users.dat"   UserID::Gender::Age::Occupation::Zip-code
- "movies.dat"  MovieID::Title::Genres
> integrated data :  
UserID::UserID::Gender::Age::Occupation::Zip-code::
MovieID::MovieID::Title::Genres
::Rating
::Timestamp 

In [13]:
mv  = pd.read_csv('ml-1m/movies.dat',sep='::',names=['mov_id','title','genres'])
usr = pd.read_csv('ml-1m/users.dat' ,sep='::',names=['usr_id','gender','age','occupation','zip-code'])

  """Entry point for launching an IPython kernel.
  


In [14]:
usr = usr.set_index('usr_id')

In [15]:
mv  = mv.set_index('mov_id')

In [49]:
mv = mv.reset_index()
usr = usr.reset_index()

In [41]:
mv = mv.rename(columns={'mov_id':'mv_id'})

In [50]:
usr

Unnamed: 0,usr_id,gender,age,occupation,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [21]:
rtp

Unnamed: 0,usr_id,mv_id,star,timestamp,star_mean,star_std,star_qt0,star_qt1
0,1,1193,5,978300760,4.188679,0.680967,5.0,5
1,1,661,3,978302109,4.188679,0.680967,5.0,5
2,1,914,3,978301968,4.188679,0.680967,5.0,5
3,1,3408,4,978300275,4.188679,0.680967,5.0,5
4,1,2355,5,978824291,4.188679,0.680967,5.0,5
...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,3.577713,1.179719,4.0,5
1000205,6040,1094,5,956704887,3.577713,1.179719,4.0,5
1000206,6040,562,5,956704746,3.577713,1.179719,4.0,5
1000207,6040,1096,4,956715648,3.577713,1.179719,4.0,5


In [52]:
rtp_m = pd.merge(rtp,mv,how='inner',on='mv_id')

In [53]:
rtp_mu = pd.merge(rtp_m,usr,how='outer',on='usr_id')

Unnamed: 0,usr_id,mv_id,star,timestamp,star_mean,star_std,star_qt0,star_qt1,index,title,genres,gender,age,occupation,zip-code
0,1,1193,5,978300760,4.188679,0.680967,5.0,5,1176,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,4.188679,0.680967,5.0,5,655,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,4.188679,0.680967,5.0,5,902,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,4.188679,0.680967,5.0,5,3339,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,4.188679,0.680967,5.0,5,2286,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,4211,3791,2,965319075,2.958333,1.122078,4.0,5,3722,Footloose (1984),Drama,M,45,5,77662
1000205,4211,3806,3,965319138,2.958333,1.122078,4.0,5,3737,MacKenna's Gold (1969),Western,M,45,5,77662
1000206,4211,3840,4,965319197,2.958333,1.122078,4.0,5,3770,Pumpkinhead (1988),Horror,M,45,5,77662
1000207,4211,3766,2,965319138,2.958333,1.122078,4.0,5,3697,Missing in Action (1984),Action|War,M,45,5,77662


In [17]:
usr_info = rtp['usr_id'].transform([
                        lambda x : usr.loc[x]['gender'] ,
                        lambda x : usr.loc[x]['age'] ,
                        lambda x : usr.loc[x]['occupation'] ,
                        lambda x : usr.loc[x]['zip-code']                        
                       ])
usr_info.columns = ['gender','age','occupation','zip-code']

KeyboardInterrupt: 

In [None]:
mv_info = rtp['mv_id'].transform([
                        lambda x : mv.loc[x]['title'] ,
                        lambda x : mv.loc[x]['genres']                
                       ])
usr_info.columns = ['gender','age','occupation','zip-code']
mv_info .columns = ['Title','Genres']

In [None]:
df = pd.concat([rtp,usr_info,mv_info],axis=1)

In [None]:
df

#### store integrated data

In [None]:
# df.to_csv('./ml-1m/integrated.csv')

In [None]:
# import pickle

# with open('integrated.pkl','wb') as f:
#     pickle.dump(df,f)

In [None]:
df.groupby(['usr_id']).get_group(3)['star'].plot()

In [None]:
df['star'].value_counts()

In [None]:
df['star'].unique()

In [None]:
import time 

In [None]:
dir(time.gmtime(df.iloc[0]['timestamp']))

In [None]:
time.gmtime(df[])

In [None]:
df.groupby(['usr_id']).get_group(2)['Genres'].unique()