In [1]:
#importing required libraries 
%matplotlib inline
import pandas as pd
import numpy as np
from scipy.stats import truncnorm
from ast import literal_eval

from sklearn.utils import shuffle
pd.options.display.max_columns = None

In [156]:
# Load the rating events (one row per user/movie rating) and preview them.
ratings = pd.read_csv('data/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [157]:
# One row per distinct user, ordered by userId; later cells add feature columns.
customers = pd.DataFrame({'userId': ratings['userId'].unique()}).sort_values('userId')

In [158]:
 # Number of ratings per user. NOTE(review): `c` is not read anywhere else in the visible notebook.
 c = ratings['userId'].value_counts()

In [159]:
def cal_avg_rating_user(row,col):
    """Return the mean rating of the entity identified by ``row[col]``.

    Reads the notebook-level ``ratings`` frame, keeps the rows whose ``col``
    equals ``int(row[col])`` and divides the sum of their ``rating`` values
    by the number of matching rows.
    """
    key = int(row[col])
    matching = ratings.loc[ratings[col] == key, 'rating']
    return matching.sum() / len(matching)


def avg_rating_user (df,col,label):
    """Store the per-row mean rating in ``df[label]`` and return ``df``.

    ``df`` is modified in place; the same object is returned for chaining.
    """
    df[label] = df.apply(cal_avg_rating_user, axis=1, args=(col,))
    return df

In [160]:
# Per-user mean rating; cal_likability reads this column as each user's baseline.
customers = avg_rating_user (customers,'userId','avg_rating')

In [161]:
def cal_avg_rating(row,col):
    """Return the SUM of ratings for the entity identified by ``row[col]``.

    Despite the name, no division happens here: the value is the total of
    ``rating`` over the rows of the notebook-level ``ratings`` frame whose
    ``col`` equals ``int(row[col])``. The notebook divides by the rating
    count in a later cell to obtain the actual average.
    """
    key = int(row[col])
    matching = ratings.loc[ratings[col] == key, 'rating']
    return matching.sum()


def avg_rating (df,col,label):
    """Store the per-row rating sum in ``df[label]`` and return ``df``.

    ``df`` is modified in place; the same object is returned for chaining.
    """
    df[label] = df.apply(cal_avg_rating, axis=1, args=(col,))
    return df

In [162]:
# One row per distinct movie, ordered by movieId; later cells add feature columns.
items = pd.DataFrame({'movieId': ratings['movieId'].unique()}).sort_values('movieId')

In [163]:
# At this point 'avg_rating_item' holds the per-movie rating SUM (cal_avg_rating
# does not divide); a later cell divides it by 'quantity_item'.
items = avg_rating(items,'movieId','avg_rating_item')

In [164]:
def cal_quantity(row,col):
    """Return how many rows of ``ratings`` belong to the entity ``row[col]``.

    Counts the rows of the notebook-level ``ratings`` frame whose ``col``
    equals ``int(row[col])``.
    """
    key = int(row[col])
    is_match = ratings[col] == key
    return int(is_match.sum())


def quantity (df,col,label):
    """Store the per-row rating count in ``df[label]`` and return ``df``.

    ``df`` is modified in place; the same object is returned for chaining.
    """
    df[label] = df.apply(cal_quantity, axis=1, args=(col,))
    return df

In [165]:
# Number of ratings each movie received.
items = quantity(items,'movieId','quantity_item')

In [166]:
def cal_likability(row,col):
    """Return the summed rating deviation for the entity ``row[col]``.

    For every row of the notebook-level ``ratings`` frame whose ``col``
    equals ``int(row[col])``, the rating is compared with the rating user's
    baseline (``customers['avg_rating']``) and the differences are added up.
    The result is a sum, not a mean; the notebook divides by the rating
    count in a later cell.

    A rating whose user has no baseline in ``customers`` makes the result
    NaN instead of being silently dropped.
    """
    id_val = int(row[col])
    matched = ratings.loc[ratings[col] == id_val, ['userId', 'rating']]
    # Look up all baselines in one vectorised step. The previous per-rating
    # float(<one-element Series>) conversion is deprecated in pandas and
    # rescanned `customers` once per rating.
    baseline = customers.set_index('userId')['avg_rating']
    deviations = matched['rating'] - matched['userId'].map(baseline)
    # skipna=False keeps a missing baseline visible as NaN in the output.
    return float(deviations.sum(skipna=False))


def likability(df,col,label):
    """Store the per-row summed rating deviation in ``df[label]``; return ``df``.

    ``df`` is modified in place; the same object is returned for chaining.
    """
    df[label] = df.apply(lambda row: cal_likability(row,col), axis = 1)
    return df

In [167]:
# Summed (rating - user's mean rating) per movie; divided by the rating count in a later cell.
items = likability(items,'movieId','likability')

In [168]:
# Turn the per-movie sums into per-rating means.
# Not idempotent: running this cell a second time divides the columns again.
items['avg_rating_item'] = items['avg_rating_item'] / items['quantity_item']
items['likability'] = items['likability'] / items['quantity_item']

In [169]:
# Preview the per-movie features.
items.head()

Unnamed: 0,movieId,avg_rating_item,quantity_item,likability
417,1,3.87247,247,0.225976
650,2,3.401869,107,-0.155981
319,3,3.161017,59,-0.421958
2084,4,2.384615,13,-1.248102
651,5,3.267857,56,-0.322901


In [170]:
# 'avg_rating_customer' holds the per-user rating SUM here (cal_avg_rating does
# not divide); a later cell divides it by 'quantity_customer'.
customers = avg_rating(customers,'userId','avg_rating_customer')
# Number of ratings each user gave.
customers = quantity(customers,'userId','quantity_customer')

In [171]:
def cal_seen_popularity(row):
    """Return the summed rating count of the movies this user has rated.

    Collects the distinct ``movieId`` values the user ``int(row['userId'])``
    has in the notebook-level ``ratings`` frame and adds up the matching
    ``items['quantity_item']`` values. The result is a sum, not a mean; the
    notebook divides by the user's rating count in a later cell.

    A rated movie that is missing from ``items`` makes the result NaN
    instead of being silently dropped.
    """
    id_val = int(row['userId'])
    seen = ratings.loc[ratings['userId'] == id_val, 'movieId'].unique()
    # One indexed lookup for all movies. The previous per-movie
    # float(<one-element Series>) conversion is deprecated in pandas and
    # rescanned `items` once per movie.
    per_item = items.set_index('movieId')['quantity_item']
    return float(per_item.reindex(seen).sum(skipna=False))


def seen_popularity(df):
    """Store the per-user summed popularity in ``df['seen_popularity']``; return ``df``.

    ``df`` is modified in place; the same object is returned for chaining.
    """
    df['seen_popularity'] = df.apply(lambda row: cal_seen_popularity(row), axis = 1)
    return df

In [172]:
# Summed popularity of the movies each user rated; divided by the user's rating count in a later cell.
customers = seen_popularity(customers)

In [173]:
def cal_seen_rating(row):
    """Return the summed ``avg_rating_item`` of the movies this user has rated.

    Collects the distinct ``movieId`` values the user ``int(row['userId'])``
    has in the notebook-level ``ratings`` frame and adds up the matching
    ``items['avg_rating_item']`` values. The result is a sum, not a mean;
    the notebook divides by the user's rating count in a later cell.

    A rated movie that is missing from ``items`` makes the result NaN
    instead of being silently dropped.
    """
    id_val = int(row['userId'])
    seen = ratings.loc[ratings['userId'] == id_val, 'movieId'].unique()
    # One indexed lookup for all movies. The previous per-movie
    # float(<one-element Series>) conversion is deprecated in pandas and
    # rescanned `items` once per movie.
    per_item = items.set_index('movieId')['avg_rating_item']
    return float(per_item.reindex(seen).sum(skipna=False))


def seen_rating(df):
    """Store the per-user summed item rating in ``df['seen_rating']``; return ``df``.

    ``df`` is modified in place; the same object is returned for chaining.
    """
    df['seen_rating'] = df.apply(lambda row: cal_seen_rating(row), axis = 1)
    return df

In [174]:
# Summed item rating of the movies each user rated; divided by the user's rating count in a later cell.
customers = seen_rating(customers)

In [175]:
# Turn the per-user sums into per-rating means.
# Not idempotent: running this cell a second time divides the columns again.
customers['avg_rating_customer'] = customers['avg_rating_customer'] / customers['quantity_customer']
customers['seen_popularity'] = customers['seen_popularity'] / customers['quantity_customer']
customers['seen_rating'] = customers['seen_rating'] / customers['quantity_customer']

In [176]:
# Preview the per-user features.
customers.head()

Unnamed: 0,userId,avg_rating,avg_rating_customer,quantity_customer,seen_popularity,seen_rating
0,1,2.55,2.55,20,45.55,3.562529
1,2,3.486842,3.486842,76,106.578947,3.536646
2,3,3.568627,3.568627,51,116.843137,3.716531
3,4,4.348039,4.348039,204,72.480392,3.610074
4,5,3.91,3.91,100,92.28,3.567149


In [177]:
def get_truncated_normal(mean, sd, low, upp, size):
    """Draw ``size`` samples from a normal distribution truncated to [low, upp].

    ``mean`` and ``sd`` are the location and scale of the underlying normal.
    ``truncnorm`` expects its bounds expressed in standard deviations from
    the mean, so ``low`` and ``upp`` are standardised before use.
    """
    lower_z = (low - mean) / sd
    upper_z = (upp - mean) / sd
    distribution = truncnorm(lower_z, upper_z, loc=mean, scale=sd)
    return distribution.rvs(size=size)

In [178]:
def add_column(df,col_name, para,size, possible_values=None, categorical=True,):
    """Add a randomly generated column to ``df`` and return ``df``.

    categorical=True:  draws ``size`` values from ``possible_values``
                       (default ``[1, 0]``) with replacement, using ``para``
                       as the per-value probabilities.
    categorical=False: treats ``para`` as ``[mean, sd, low, upp]`` of a
                       truncated normal and converts each sample with
                       ``int()``.

    ``df`` is modified in place; the same object is returned for chaining.
    """
    choices = [1, 0] if possible_values is None else possible_values

    if not categorical:
        # numerical column
        samples = get_truncated_normal(para[0], para[1], para[2], para[3], size)
        values = [int(sample) for sample in samples]
    else:
        values = np.random.choice(choices, size=size, replace=True, p=para)

    # attach the generated values to the dataframe
    df[col_name] = values
    return df

In [179]:
# Gender
# Seed NumPy's global RNG before the first random draw so that a fresh
# "Restart & Run All" produces the same synthetic values every time.
np.random.seed(42)
gender_label = ['male','female']
# Each user is labelled 'male' with probability 0.53 and 'female' with 0.47.
customers = add_column(customers,'gender',[0.53,0.47],len(customers),possible_values = gender_label)

In [180]:
# Age: integer samples from a truncated normal, para = [mean, sd, low, upp].
# NOTE(review): the mean (47.7) lies above the upper bound (40), so every
# generated age falls in [18, 40] and below the nominal mean. Confirm the
# upper bound is intended.
customers = add_column(customers,'age',[47.7,18.2,18,40],len(customers),categorical=False)

In [181]:
# Snapshot of the item features taken before the metadata columns are attached.
item_copy = items.copy()

In [182]:
# Reset `items` from the snapshot so the metadata cells below can be re-run from a clean state.
items = item_copy.copy()

In [183]:
# Load the movie metadata. low_memory=False makes pandas infer each column's
# dtype from the whole file rather than chunk by chunk.
# NOTE(review): the warning fragment in this cell's saved output looks like
# pandas' mixed-type DtypeWarning, which this option addresses; confirm.
m_df = pd.read_csv('data/movies_metadata.csv', low_memory=False)

# getting genres: each cell is the string form of a list of dicts; keep only
# the 'name' of every entry (missing or non-list values become an empty list)
m_df['genres'] = (
    m_df['genres']
    .fillna('[]')
    .apply(literal_eval)
    .apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
)

# getting year: the text before the first '-' of the parsed release date.
# pd.notna is required here: `x != np.nan` is True for every value, so
# unparseable dates used to come out as the string 'NaT' instead of NaN.
m_df['year'] = pd.to_datetime(m_df['release_date'], errors='coerce').apply(
    lambda x: str(x).split('-')[0] if pd.notna(x) else np.nan
)

  interactivity=interactivity, compiler=compiler, result=result)


In [184]:
# Keep only the first len(item_copy) metadata rows. This is a positional slice:
# rows are chosen by file order, not by any movie identifier.
m_df = m_df[:len(item_copy)]

In [185]:
# Attach metadata columns to the item table.
# NOTE(review): these assignments align on the DataFrame index label, not on
# movieId. `items` is labelled by each movie's order of first appearance in
# `ratings`, and `m_df` by its row position in the metadata file, so each
# title/genres/year lands on whichever item happens to share that label.
# Presumably a movieId -> metadata-id mapping is needed for a correct join;
# confirm against the dataset.
items['title'] = m_df['original_title']
items['genres'] = m_df['genres']
items['year'] = m_df['year']

In [186]:
# Peek at the genre lists attached to the first few items.
items.genres.head()

417     [Mystery, Crime, Drama, Thriller]
650              [Romance, Comedy, Drama]
319              [Comedy, Drama, Romance]
2084           [Adventure, Drama, Action]
651        [Adventure, Animation, Family]
Name: genres, dtype: object

In [187]:
# Reduce each item's genre list to a single 'genre' column.
# 1. Expand every list into one row per genre, keeping the item's index label.
temp = items.apply(lambda row: pd.Series(row['genres']),axis=1).stack().reset_index(level=1, drop=True)
temp.name = 'genre'
# 2. Replace the list column with the expanded one (one row per item/genre pair).
items = items.drop('genres', axis=1).join(temp)
# 3. Keep only the first row per index label, i.e. the first genre in each list.
items = items[~items.index.duplicated(keep='first')]

In [188]:
# Cost: synthetic integer price per item, drawn from a truncated normal with
# para = [mean, sd, low, upp] = [572.66, 730, 30, 4232].
items = add_column(items,'cost',[572.66,730,30,4232],len(items),categorical=False)

In [189]:
# Preview the item table after the metadata and cost columns were added.
items.head()

Unnamed: 0,movieId,avg_rating_item,quantity_item,likability,title,year,genre,cost
0,31,3.178571,42,-0.277838,Toy Story,1995,Animation,149
1,1029,3.702381,42,0.156436,Jumanji,1995,Adventure,583
2,1061,3.545455,33,0.082347,Grumpier Old Men,1995,Romance,450
3,1129,3.3125,48,-0.160192,Waiting to Exhale,1995,Comedy,839
4,1172,4.26087,46,0.646038,Father of the Bride Part II,1995,Comedy,846


In [190]:
# Attach the item features to every rating; ratings without a matching item are dropped.
data = ratings.merge(items, on='movieId', how='inner')

In [191]:
# Attach the user features to every rating; rows without a matching user are dropped.
data = data.merge(customers, on='userId', how='inner')

In [192]:
# Preview the merged rating / item / user table.
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,avg_rating_item,quantity_item,likability,title,year,genre,cost,avg_rating,avg_rating_customer,quantity_customer,seen_popularity,seen_rating,gender,age
0,1,31,2.5,1260759144,3.178571,42,-0.277838,Toy Story,1995,Animation,149,2.55,2.55,20,45.55,3.562529,female,39
1,1,1029,3.0,1260759179,3.702381,42,0.156436,Jumanji,1995,Adventure,583,2.55,2.55,20,45.55,3.562529,female,39
2,1,1061,3.0,1260759182,3.545455,33,0.082347,Grumpier Old Men,1995,Romance,450,2.55,2.55,20,45.55,3.562529,female,39
3,1,1129,2.0,1260759185,3.3125,48,-0.160192,Waiting to Exhale,1995,Comedy,839,2.55,2.55,20,45.55,3.562529,female,39
4,1,1172,4.0,1260759205,4.26087,46,0.646038,Father of the Bride Part II,1995,Comedy,846,2.55,2.55,20,45.55,3.562529,female,39


In [193]:
# Final column order: user attributes, item attributes, aggregate features, and
# the rating last. Columns not listed here ('timestamp' and 'avg_rating')
# are dropped from `data`.
col_list = ['userId','gender','age','movieId','title','genre','year','cost','avg_rating_item','quantity_item','likability',
            'avg_rating_customer','quantity_customer','seen_popularity','seen_rating','rating']

data = data[col_list]

In [194]:
# Write the merged table to CSV without the index column.
data.to_csv('movie_data_final.csv',index=False)

In [195]:
# Write the user feature table to CSV without the index column.
customers.to_csv('customer.csv',index = False)

In [196]:
# Write the item feature table to CSV without the index column; 'title' is kept in the export.
items.to_csv('items.csv',index = False)