## Step 1: Create a SparkSession with a SparkContext

In [1]:
import pyspark
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [2]:
spark

In [3]:
sc

## Step 2: Create Spark DataFrames

In [4]:
import pandas as pd

### Ratings Dataframe

In [5]:
#Notice this is a local file. There are computer clusters don't have local files so be careful.
rf = spark.read.json('data/ratings.json')

In [6]:
#look at schema
rf

DataFrame[movie_id: bigint, rating: bigint, timestamp: double, user_id: bigint]

In [7]:
rf.limit(5).show()

+--------+------+------------+-------+
|movie_id|rating|   timestamp|user_id|
+--------+------+------------+-------+
|     858|     4|9.56678732E8|   6040|
|    2384|     4|9.56678754E8|   6040|
|     593|     5|9.56678754E8|   6040|
|    1961|     4|9.56678777E8|   6040|
|    1419|     3|9.56678856E8|   6040|
+--------+------+------------+-------+



In [8]:
rf.limit(5).toPandas()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,858,4,956678732.0,6040
1,2384,4,956678754.0,6040
2,593,5,956678754.0,6040
3,1961,4,956678777.0,6040
4,1419,3,956678856.0,6040


In [9]:
#Instead of loading this file in over again every time we want to look at it, we'll use
rf.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: double, user_id: bigint]

In [10]:
rf.head()

Row(movie_id=858, rating=4, timestamp=956678732.0, user_id=6040)

In [11]:
drf = rf.toPandas()

In [12]:
drf.head()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,858,4,956678732.0,6040
1,2384,4,956678754.0,6040
2,593,5,956678754.0,6040
3,1961,4,956678777.0,6040
4,1419,3,956678856.0,6040


In [13]:
drf = drf.drop('timestamp', axis=1)

In [14]:
drf.movie_id.value_counts()

2858    2619
260     2258
1196    2244
1210    2204
589     2069
480     2025
2028    2003
1270    1993
608     1942
2571    1937
1580    1931
593     1917
1198    1900
110     1877
2762    1839
2396    1819
1197    1765
527     1754
1617    1734
1097    1725
2628    1722
2997    1706
858     1688
1265    1680
318     1670
2716    1639
356     1631
296     1626
1240    1586
1       1559
        ... 
3295       1
3601       1
2510       1
3290       1
730        1
1470       1
758        1
1558       1
1118       1
2845       1
796        1
3353       1
601        1
792        1
790        1
1115       1
789        1
3164       1
1630       1
3607       1
607        1
3337       1
98         1
2484       1
2308       1
2584       1
3842       1
3687       1
2811       1
3890       1
Name: movie_id, Length: 3642, dtype: int64

It looks like we have ratings for 3642 movies.

In [15]:
drf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719949 entries, 0 to 719948
Data columns (total 3 columns):
movie_id    719949 non-null int64
rating      719949 non-null int64
user_id     719949 non-null int64
dtypes: int64(3)
memory usage: 16.5 MB


In [16]:
drf.user_id.value_counts()

1680    1671
889     1384
4169    1300
4277    1255
3618    1198
1941    1183
1150    1175
5795    1159
4344    1145
4510    1114
4227    1091
3841    1083
4508    1054
3808    1048
5367    1006
3539     971
2063     963
5831     947
3224     923
4725     914
5643     913
3032     910
4064     898
2909     894
5954     887
1285     886
4447     878
1181     875
1980     871
3778     866
        ... 
1447       5
5484       5
5998       5
1547       5
4610       5
2775       5
1463       5
5291       5
918        5
3991       5
4418       5
4744       5
750        5
2057       5
4467       5
1252       5
3152       4
1779       4
4528       4
4463       4
827        4
2991       4
988        4
1256       3
3288       3
1551       3
4273       3
3038       2
3326       1
3459       1
Name: user_id, Length: 5400, dtype: int64

It looks like we 5400 unique users.

### Requests Dataframe. This is the Dataframe of movies users have not seen, which we will populate with recommendations.

In [17]:
#Notice this is a local file. There are computer clusters don't have local files so be careful.
rq = spark.read.json('data/requests.json')

In [18]:
#look at schema
rq

DataFrame[movie_id: bigint, rating: double, timestamp: double, user_id: bigint]

In [19]:
rq.limit(5).show()

+--------+------+------------+-------+
|movie_id|rating|   timestamp|user_id|
+--------+------+------------+-------+
|    2019|   NaN|9.56678777E8|   6040|
|     759|   NaN|9.56679248E8|   6040|
|    2858|   NaN|9.56679275E8|   6040|
|     246|   NaN|9.56679413E8|   6040|
|    1617|   NaN|9.56679473E8|   6040|
+--------+------+------------+-------+



In [20]:
rq.limit(5).toPandas()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,2019,,956678777.0,6040
1,759,,956679248.0,6040
2,2858,,956679275.0,6040
3,246,,956679413.0,6040
4,1617,,956679473.0,6040


In [21]:
#Instead of loading this file in over again every time we want to look at it, we'll use
rq.persist()

DataFrame[movie_id: bigint, rating: double, timestamp: double, user_id: bigint]

In [22]:
rq.head()

Row(movie_id=2019, rating=nan, timestamp=956678777.0, user_id=6040)

In [23]:
drq = rq.toPandas()

In [24]:
drq.head()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,2019,,956678777.0,6040
1,759,,956679248.0,6040
2,2858,,956679275.0,6040
3,246,,956679413.0,6040
4,1617,,956679473.0,6040


In [25]:
drq = drq.drop('timestamp', axis=1)

In [26]:
drq.movie_id.value_counts()

2858    809
3578    759
1196    746
260     733
1210    679
593     661
2571    653
2028    650
480     647
2762    620
1198    614
1580    607
1265    598
1270    590
589     580
3751    577
608     571
110     566
356     563
318     557
1617    554
1197    553
2396    550
527     550
296     545
1097    544
2716    542
3793    540
858     535
2997    535
       ... 
1324      1
865       1
775       1
744       1
3542      1
981       1
980       1
1908      1
1039      1
33        1
3209      1
1160      1
2592      1
712       1
3460      1
1154      1
3533      1
717       1
2850      1
2172      1
1316      1
634       1
1145      1
889       1
120       1
3280      1
2343      1
887       1
2258      1
402       1
Name: movie_id, Length: 3564, dtype: int64

It looks like we have 3540 unique movies.

In [27]:
drq.user_id.value_counts()

424     1226
549     1152
1088    1074
1448    1025
524     1016
4169    1014
1447     985
550      967
678      945
352      870
531      867
482      825
195      822
216      794
302      788
1733     777
329      764
245      764
1912     746
509      723
319      721
5394     697
53       684
543      671
202      670
1181     646
1425     644
411      635
1266     633
148      624
        ... 
1126       1
3897       1
1089       1
2081       1
3282       1
4264       1
1801       1
3541       1
2260       1
2516       1
1237       1
4941       1
4672       1
3446       1
835        1
1911       1
2568       1
799        1
5589       1
5206       1
2007       1
5898       1
1703       1
4570       1
867        1
2470       1
5796       1
986        1
5336       1
4652       1
Name: user_id, Length: 5970, dtype: int64

It looks like we have 5970 users to whom we want to recommend movies.

## Users Dataframe

In [28]:
usdf_headers = ['user_id', 'gender', 'age', 'occupation', 'zip_code']

In [29]:
usdf = pd.read_csv('data/users.dat', sep='::', names=usdf_headers)

  """Entry point for launching an IPython kernel.


In [30]:
usdf.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [31]:
# zip codes aren't doing much for us so we'll drop them
usdf = usdf.drop('zip_code', axis=1)

In [32]:
usdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 4 columns):
user_id       6040 non-null int64
gender        6040 non-null object
age           6040 non-null int64
occupation    6040 non-null int64
dtypes: int64(3), object(1)
memory usage: 188.8+ KB


## Movies DataFrame

In [33]:
mvdf_headers = ['title', 'genres']

In [34]:
mvdf = pd.read_csv('data/movies.dat', sep='::', names=mvdf_headers, index_col=0)

  """Entry point for launching an IPython kernel.


In [35]:
mvdf.head()

Unnamed: 0,title,genres
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy


In [36]:
mvdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3883 entries, 1 to 3952
Data columns (total 2 columns):
title     3883 non-null object
genres    3883 non-null object
dtypes: object(2)
memory usage: 91.0+ KB


In [37]:
mvdf['genres'] = mvdf['genres'].apply(lambda x: x.split('|'))
mvdf.head()

Unnamed: 0,title,genres
1,Toy Story (1995),"[Animation, Children's, Comedy]"
2,Jumanji (1995),"[Adventure, Children's, Fantasy]"
3,Grumpier Old Men (1995),"[Comedy, Romance]"
4,Waiting to Exhale (1995),"[Comedy, Drama]"
5,Father of the Bride Part II (1995),[Comedy]


In [38]:
all_genres = set([item for movie in mvdf['genres'] for item in movie])

## Metadata Dataframe

In [39]:
# metadf = pd.read_csv('data/movies_metadata.csv')

In [40]:
# metadf.head()

In [41]:
#df['Capital'] = df['Country'].map(country_capital)
# metadf.info()

In [42]:
# print(metadf.genres[1])
# print(type(metadf.genres[1]))

### OneHotEncoding

In [43]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans

In [44]:
def ohe_columns(series, name):
    ohe = OneHotEncoder(categories='auto')
    ohe.fit(series)
    cols = ohe.get_feature_names(name)
    ohe = ohe.transform(series)
    final_df = pd.DataFrame(ohe.toarray(), columns=cols)
    return final_df

In [45]:
# OHE the user cols
my_cols = ['gender', 'age', 'occupation']

ohe_multi = OneHotEncoder(categories='auto')
ohe_multi.fit(usdf[my_cols])
ohe_mat = ohe_multi.transform(usdf[my_cols])


## Clustering Demographics

In [46]:
from sklearn.cluster import KMeans

In [47]:
k_clusters = KMeans(n_clusters=8, random_state=42)

In [48]:
# cluster user demographics
k_clusters.fit(ohe_mat)
demo_preds = k_clusters.predict(ohe_mat)

In [49]:
demo_preds.shape

(6040,)

In [50]:
#User dataframe with cluster
dcdf = usdf
dcdf['cluster'] = demo_preds
dcdf.head()

Unnamed: 0,user_id,gender,age,occupation,cluster
0,1,F,1,10,4
1,2,M,56,16,0
2,3,M,25,15,2
3,4,M,45,7,1
4,5,M,25,20,2


### Now let's create the ratings dataframe with the `user_id`'s associated cluster

In [51]:
cluster_dict = {}
for k, v in zip(usdf['user_id'].tolist(), usdf['cluster'].tolist()):
    cluster_dict[k] = v

In [52]:
#ratings dataframe with user cluster label
mvcf = drf
mvcf['cluster'] = mvcf['user_id'].apply(lambda x: cluster_dict[x])

In [53]:
mvcf.tail()

Unnamed: 0,movie_id,rating,user_id,cluster
719944,3198,4,635,0
719945,2966,4,635,0
719946,2303,4,635,0
719947,802,4,1875,5
719948,892,4,1875,5


### We'll export that to a csv

In [54]:
mvcf.to_csv('data/movie_rating_user_clusters')

## Now we build functions for prediction and recommendation

In [55]:
#How do we find a user's cluster?
cluster6040 = mvcf.loc[mvcf.user_id == 6040]['cluster'].mean()

In [56]:
cluster6040

2.0

In [57]:
def get_cluster(df, user_id):
    label = df.loc[df.user_id == user_id]['cluster'].mean()
    return label

In [58]:
get_cluster(mvcf, 1088)

4.0

In [59]:
mvcf.loc[mvcf.movie_id == 2303]['rating'].mean()

3.8949579831932772

In [60]:
# def get_avg_cluster_rating(df, movie_id, cluster):
#     '''check that movie_id has been rated. If not, return average rating of that cluster'''
#     if movie_id not in df.movie_id.values:
#         clustered_avg = df.loc[df.cluster == cluster]['rating'].mean() 
#     '''check that movie has been rated by members of that cluster. If not, return average rating of that movie'''
#     rows = df.loc[(df.movie_id == movie_id) & (df.cluster == cluster)]
#     if len(rows) == 0:
#         clustered_avg = df.loc[df.movie_id == movie_id]['rating'].mean() 
#     else:
#         clustered_avg = rows['rating'].mean()
#     return clustered_avg

In [61]:
def get_avg_cluster_rating(df, movie_id, cluster):
    rows = df.loc[(df.movie_id == movie_id) & (df.cluster == cluster)]
    clustered_avg = rows['rating'].mean()
    return clustered_avg

In [62]:
#test this out
get_avg_cluster_rating(mvcf, 2303, cluster=1)

4.133333333333334

In [63]:
#test this out with Toy Story, which is not in `mvcf`
get_avg_cluster_rating(mvcf, 10194, 1)

nan

In [64]:
mvcf.loc[mvcf.cluster == 1]['rating'].mean() 

3.623765259267555

In [65]:
#John's functions to input user and cluster biases
def user_bias(df, user_id):
    return  df.loc[df['user_id'] == user_id, 'rating'].mean() - df['rating'].mean()

def item_bias(df, movie_id):
    return  df.loc[df['movie_id'] == movie_id, 'rating'].mean() - df['rating'].mean()

In [66]:
user_bias(mvcf, 1088)

-0.36498774055319094

In [67]:
item_bias(mvcf, 2303)

0.30448004656165484

In [68]:
mrc = pd.read_csv('data/movie_rating_user_clusters')
def tara_get_cold_start(user_id, movie_id, df):
    '''Check that movie is in the dataframe. If not, return average reviews for that user'''
    if movie_id not in df.movie_id.values:
        predicted_rating =  df.loc[df.user_id == user_id]['rating'].mean()      
    '''Identify if user_id is clustered. If not, return average reviews for that movie'''
    if user_id not in df.user_id.values:
        predicted_rating = df.loc[df.movie_id == movie_id]['rating'].mean()
    else:
        '''Find cluster'''
        user_cluster = get_cluster(df, user_id)
        '''Find avg cluster rating'''
        cluster_rating = get_avg_cluster_rating(df, movie_id, user_cluster)
        '''Weigh biases'''
        ubias = user_bias(df, user_id)
        ibias = item_bias(df, movie_id)
        predicted_rating = cluster_rating + ubias + ibias
    return predicted_rating
    

In [69]:
def cold_start_preds_column(user_id, movie_id):
    mrc = pd.read_csv('data/movie_rating_user_clusters')
    '''load requests json and create dataframe'''
    rq = spark.read.json('data/requests.json')
    rq.persist
    drq = rq.toPandas()
    drq = drq.drop('timestamp', axis=1)
    '''attach cluster information'''
    
    

In [70]:
mrc.head()

Unnamed: 0.1,Unnamed: 0,movie_id,rating,user_id,cluster
0,0,858,4,6040,2
1,1,2384,4,6040,2
2,2,593,5,6040,2
3,3,1961,4,6040,2
4,4,1419,3,6040,2


In [71]:
cluster_dict = {}
for k, v in zip(usdf['user_id'].tolist(), usdf['cluster'].tolist()):
    cluster_dict[k] = v

In [79]:
pq = drq

In [80]:
pq['cluster'] = pq['user_id'].apply(lambda x: cluster_dict[x])
pq.head()

Unnamed: 0,movie_id,rating,user_id,cluster
0,2019,,6040,2
1,759,,6040,2
2,2858,,6040,2
3,246,,6040,2
4,1617,,6040,2


In [81]:
pq['pred'] = pq[]

SyntaxError: invalid syntax (<ipython-input-81-725793815878>, line 1)

## Export clustered requests to csv

In [103]:
# pq.to_csv('data/requests_clusters')

In [None]:
def cold_preds_column(df):
    df = pd.read_csv('data/requests_clusters')
    df['pred'] = 

In [74]:
# test this with unrated movie
tara_get_cold_start(user_id=6040, movie_id=2019, df=mrc)

5.547255568910275

In [75]:
# test this with rated movie. Should be 4
tara_get_cold_start(6040, 858, df=mrc)

5.4660353463368825

In [76]:
mvcf.loc[(mvcf.user_id == 6040) & (mvcf.movie_id == 858)]['rating']

0    4
Name: rating, dtype: int64

In [77]:
mvcf.loc[mvcf.user_id == 6040]['rating'].mean()

3.5161290322580645

In [78]:
mvcf.iloc[2]

movie_id     593
rating         5
user_id     6040
cluster        2
Name: 2, dtype: int64

## Those functions are working. Just for fun we'll create a dataframe of requests with predicted values.

In [78]:
pq = drq

In [79]:
# pq['pred_rating'] = pq.apply(tara_get_cold_start(pq['user_id'], pq['movie_id']), axis=0, result_type='broadcast')
# pq.head()

In [87]:
def rate_that(row):
    return tara_get_cold_start(row['user_id'], row['movie_id'], mrc)

In [85]:
pq.head()

Unnamed: 0,movie_id,rating,user_id
0,2019,,6040
1,759,,6040
2,2858,,6040
3,246,,6040
4,1617,,6040


In [93]:
pq['preds'] = pq.apply(rate_that, axis=1)

KeyboardInterrupt: 