In [1]:
import pandas as pd
import pickle
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import random
from scipy.stats import pearsonr
from statistics import mean

In [2]:
# Read data
ratings = pd.read_csv("/content/drive/My Drive/ratings.dat", header = None, sep = "::" )
ratings = ratings.rename(columns = {0:'userId', 1:'movieId', 2: 'rating', 3: 'timestamp'})
ratings['item_id'] = ratings['movieId'].astype("category").cat.codes

# Create mapping between cat codes and real movie id
item_list = ratings[['movieId', 'item_id']].drop_duplicates()
ratings = ratings.drop(['item_id'], axis=1)

  


In [3]:
# Load latent factors
embedding_matrix = pd.read_csv("/content/drive/My Drive/NeuMF.csv")
embedding_matrix.head()

item_latent_df = embedding_matrix.drop(['item_id','title','genres','year','genre_count','categories_movie_isin','year_group','year_genre','Label'], axis=1)
item_latent_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,movieId
0,-0.745756,-0.810328,-0.180637,-0.392364,0.420373,-0.453638,-0.824393,-0.6441,1.445741,-0.72554,-1.105817,-0.451083,-0.905751,-0.219297,-0.190617,0.22678,-0.661695,0.853211,-0.305958,-0.665272,0.479558,0.401969,0.0599,0.031561,0.006076,-0.995521,-0.198687,-0.547694,-0.484878,-0.416365,0.245675,-0.446353,0.143993,0.265401,0.928002,-0.513451,0.5017,1.328656,0.327769,-0.115712,-0.024511,0.307928,0.369817,0.438285,0.800622,0.235088,-0.529664,-0.123766,-0.73976,0.022672,-0.120049,0.301133,1.166055,-0.044849,-0.262418,-0.135846,0.978241,-0.843637,0.026694,0.508192,0.264034,-0.113671,0.664693,-0.306955,1
1,-0.375606,-0.429183,-0.441106,-0.762159,0.217342,-0.328485,-0.113656,-0.718548,1.511002,-0.561163,-1.040244,0.373338,-0.819342,-0.415236,-0.240466,0.252216,0.16711,-0.01181,-0.001892,0.009728,-0.21264,0.121976,-0.231064,0.351407,-0.569718,-0.534676,-0.086434,0.003924,-0.264484,-0.412184,0.491557,-0.007684,-0.062305,-0.175552,0.440789,0.197777,0.302935,0.276161,-0.352966,0.035115,0.090242,-0.199275,0.570234,-0.105711,-0.252642,0.232716,-0.117734,-0.361439,-0.509888,0.180903,0.668486,-0.177819,0.503767,-0.412506,-0.213499,-0.351064,-0.005325,-0.466856,-0.531038,0.28565,-0.043552,-0.390956,0.23311,-0.74115,2
2,-0.22723,-0.456836,0.157979,-0.068205,0.400518,-0.572706,0.257696,0.002668,0.533761,-0.513288,-0.596753,0.426836,-0.118945,0.110914,0.119412,0.035941,0.215133,0.035955,0.121595,-0.053975,0.177453,0.193951,0.147895,-0.085451,-0.611721,-0.808998,0.175548,0.119251,-0.458744,-0.161019,-0.006067,-0.200615,-0.415333,-0.237771,0.458441,0.036061,-0.291382,0.390405,-0.40333,-0.009877,0.122429,0.204094,0.370744,0.19237,0.111742,0.024023,-0.167029,-0.035981,0.281059,0.697248,0.360497,0.148832,0.250387,0.123672,0.179202,0.027835,-0.08756,0.120911,-0.229782,-0.481881,-0.145929,-0.067575,0.419661,-0.313982,3
3,-0.398684,-0.282128,0.012208,-0.560183,0.298808,-0.018783,-0.035054,0.133722,-0.232693,-0.51958,-0.385273,-0.063122,0.03458,0.242758,0.481521,0.574108,0.368946,0.428156,-0.778454,-0.015375,0.166613,0.075688,-0.26009,-0.361905,-0.312964,-0.205246,-0.251947,0.280492,-0.032194,-0.432292,-0.018221,-0.621794,-0.376753,0.097546,-0.202932,0.101295,-0.279971,0.204058,0.197893,0.275582,0.259211,0.366708,0.519585,0.25475,-0.304194,0.119505,0.374932,-0.3508,0.354307,-0.051929,-0.053794,0.347501,0.02399,0.341532,0.554989,0.100263,-0.005747,0.009932,-0.320261,-0.488145,-0.028105,0.298142,0.493528,0.016806,4
4,0.060892,-0.138244,-0.118853,-0.707611,0.431864,-0.670602,-0.168645,-0.151529,0.064511,-0.666976,-0.453228,-0.039383,-0.152126,0.197255,0.149095,0.453856,0.281051,0.225554,-0.151558,0.395273,0.052312,0.15499,-0.134952,-0.144897,-0.327003,-0.743648,0.124289,0.442096,-0.225206,-0.648629,-0.028192,-0.374055,-0.213524,0.085272,0.12871,0.560728,-0.113365,0.480774,-0.344052,-0.010517,0.383576,0.235916,0.421236,0.172347,-0.349929,0.021005,0.768824,-0.279633,0.120475,-0.003561,0.128235,0.009847,-0.053336,0.215822,0.190739,0.088055,0.255074,-0.040581,-0.541753,-0.42132,0.176482,0.263117,0.346135,-0.067497,5


In [4]:
# Load Movies dataset
rnames = ['movie_id','title','genres']
movies = pd.read_table('/content/drive/My Drive/movies.dat',sep='::',header=None, names=rnames)
movies['year'] = movies.title.str.extract("\((\d{4})\)", expand=True)
movies.year = pd.to_datetime(movies.year, format='%Y')
movies.year = movies.year.dt.year # As there are some NaN years, resulting type will be float (decimals)
movies['genre'] = movies.genres.str.split('|')
genre_count = [len(i) for i in movies.genre]
genre_count_df = pd.DataFrame(genre_count, columns = ['genre_count'])
movies = pd.concat([movies, genre_count_df], axis = 1)
movies = pd.merge(item_list, movies, left_on = 'movieId', right_on = 'movie_id', how = 'left')
movies = movies.drop(columns=['movie_id'])

# Create list of genres
mixed_genres = []
two_genres = []
unique_genres = []
for i in set(movies['genres']):
  if "|" in i:
    mixed_genres.append(i)
    count = i.count('|')
    if count == 1:
      two_genres.append(i)
  else:
    unique_genres.append(i)

movies.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,movieId,item_id,title,genres,year,genre,genre_count
0,1193,1104,One Flew Over the Cuckoo's Nest (1975),Drama,1975,[Drama],1
1,661,639,James and the Giant Peach (1996),Animation|Children's|Musical,1996,"[Animation, Children's, Musical]",3
2,914,853,My Fair Lady (1964),Musical|Romance,1964,"[Musical, Romance]",2
3,3408,3177,Erin Brockovich (2000),Drama,2000,[Drama],1
4,2355,2162,"Bug's Life, A (1998)",Animation|Children's|Comedy,1998,"[Animation, Children's, Comedy]",3


In [5]:
dataset_all = pd.merge(item_latent_df, movies, on = 'movieId').dropna()
dataset_all.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,movieId,item_id,title,genres,year,genre,genre_count
0,-0.745756,-0.810328,-0.180637,-0.392364,0.420373,-0.453638,-0.824393,-0.6441,1.445741,-0.72554,-1.105817,-0.451083,-0.905751,-0.219297,-0.190617,0.22678,-0.661695,0.853211,-0.305958,-0.665272,0.479558,0.401969,0.0599,0.031561,0.006076,-0.995521,-0.198687,-0.547694,-0.484878,-0.416365,0.245675,-0.446353,0.143993,0.265401,0.928002,-0.513451,0.5017,1.328656,0.327769,-0.115712,-0.024511,0.307928,0.369817,0.438285,0.800622,0.235088,-0.529664,-0.123766,-0.73976,0.022672,-0.120049,0.301133,1.166055,-0.044849,-0.262418,-0.135846,0.978241,-0.843637,0.026694,0.508192,0.264034,-0.113671,0.664693,-0.306955,1,0,Toy Story (1995),Animation|Children's|Comedy,1995,"[Animation, Children's, Comedy]",3
1,-0.375606,-0.429183,-0.441106,-0.762159,0.217342,-0.328485,-0.113656,-0.718548,1.511002,-0.561163,-1.040244,0.373338,-0.819342,-0.415236,-0.240466,0.252216,0.16711,-0.01181,-0.001892,0.009728,-0.21264,0.121976,-0.231064,0.351407,-0.569718,-0.534676,-0.086434,0.003924,-0.264484,-0.412184,0.491557,-0.007684,-0.062305,-0.175552,0.440789,0.197777,0.302935,0.276161,-0.352966,0.035115,0.090242,-0.199275,0.570234,-0.105711,-0.252642,0.232716,-0.117734,-0.361439,-0.509888,0.180903,0.668486,-0.177819,0.503767,-0.412506,-0.213499,-0.351064,-0.005325,-0.466856,-0.531038,0.28565,-0.043552,-0.390956,0.23311,-0.74115,2,1,Jumanji (1995),Adventure|Children's|Fantasy,1995,"[Adventure, Children's, Fantasy]",3
2,-0.22723,-0.456836,0.157979,-0.068205,0.400518,-0.572706,0.257696,0.002668,0.533761,-0.513288,-0.596753,0.426836,-0.118945,0.110914,0.119412,0.035941,0.215133,0.035955,0.121595,-0.053975,0.177453,0.193951,0.147895,-0.085451,-0.611721,-0.808998,0.175548,0.119251,-0.458744,-0.161019,-0.006067,-0.200615,-0.415333,-0.237771,0.458441,0.036061,-0.291382,0.390405,-0.40333,-0.009877,0.122429,0.204094,0.370744,0.19237,0.111742,0.024023,-0.167029,-0.035981,0.281059,0.697248,0.360497,0.148832,0.250387,0.123672,0.179202,0.027835,-0.08756,0.120911,-0.229782,-0.481881,-0.145929,-0.067575,0.419661,-0.313982,3,2,Grumpier Old Men (1995),Comedy|Romance,1995,"[Comedy, Romance]",2
3,-0.398684,-0.282128,0.012208,-0.560183,0.298808,-0.018783,-0.035054,0.133722,-0.232693,-0.51958,-0.385273,-0.063122,0.03458,0.242758,0.481521,0.574108,0.368946,0.428156,-0.778454,-0.015375,0.166613,0.075688,-0.26009,-0.361905,-0.312964,-0.205246,-0.251947,0.280492,-0.032194,-0.432292,-0.018221,-0.621794,-0.376753,0.097546,-0.202932,0.101295,-0.279971,0.204058,0.197893,0.275582,0.259211,0.366708,0.519585,0.25475,-0.304194,0.119505,0.374932,-0.3508,0.354307,-0.051929,-0.053794,0.347501,0.02399,0.341532,0.554989,0.100263,-0.005747,0.009932,-0.320261,-0.488145,-0.028105,0.298142,0.493528,0.016806,4,3,Waiting to Exhale (1995),Comedy|Drama,1995,"[Comedy, Drama]",2
4,0.060892,-0.138244,-0.118853,-0.707611,0.431864,-0.670602,-0.168645,-0.151529,0.064511,-0.666976,-0.453228,-0.039383,-0.152126,0.197255,0.149095,0.453856,0.281051,0.225554,-0.151558,0.395273,0.052312,0.15499,-0.134952,-0.144897,-0.327003,-0.743648,0.124289,0.442096,-0.225206,-0.648629,-0.028192,-0.374055,-0.213524,0.085272,0.12871,0.560728,-0.113365,0.480774,-0.344052,-0.010517,0.383576,0.235916,0.421236,0.172347,-0.349929,0.021005,0.768824,-0.279633,0.120475,-0.003561,0.128235,0.009847,-0.053336,0.215822,0.190739,0.088055,0.255074,-0.040581,-0.541753,-0.42132,0.176482,0.263117,0.346135,-0.067497,5,4,Father of the Bride Part II (1995),Comedy,1995,[Comedy],1


In [6]:
# Filter 2 genres
dataset_two = dataset_all.loc[dataset_all['genre_count'].isin([2])]

# Filter 1 genres
dataset = dataset_all.loc[dataset_all['genre_count'].isin([1])]

In [7]:
# Add column movies_order
ratings_movies = pd.merge(ratings, movies, on = 'movieId', how = 'left' )
ratings_movies['count'] = 1
ratings_movies = ratings_movies.sort_values(by=['userId','timestamp'])
ratings_movies['movies_order'] = ratings_movies.groupby(by=['userId'])['count'].transform(lambda x: x.cumsum())
ratings_movies = ratings_movies.drop(['count'], axis=1)
ratings_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,item_id,title,genres,year,genre,genre_count,movies_order
31,1,3186,4,978300019,2969,"Girl, Interrupted (1999)",Drama,1999,[Drama],1,1
22,1,1270,5,978300055,1178,Back to the Future (1985),Comedy|Sci-Fi,1985,"[Comedy, Sci-Fi]",2,2
27,1,1721,4,978300055,1574,Titanic (1997),Drama|Romance,1997,"[Drama, Romance]",2,3
37,1,1022,5,978300055,957,Cinderella (1950),Animation|Children's|Musical,1950,"[Animation, Children's, Musical]",3,4
24,1,2340,3,978300103,2147,Meet Joe Black (1998),Romance,1998,[Romance],1,5


In [8]:
def find_user_path(selected_movies, n_before, n_after):
    """ Find user's path within 21 frame

    variables :
    selected_movies item_id
    n_before         number of previous watch movies
    n_after          number of movie in target genres after watch stepping stone movies

    return:
    user_path_df    dataset of user's path fall in criteria
    """

    filter_movies = ratings_movies[ratings_movies['movieId']==selected_movies] # Selected movies
    selected_genre = list(ratings_movies[ratings_movies['movieId']==selected_movies]['genres'].drop_duplicates())[0] # movie's genre
    num_movies_dict = {}

    # Check when user watch selected movies
    order_dict = {}
    for i in list(filter_movies.index.values):
      order_dict[filter_movies['userId'][i]] = filter_movies['movies_order'][i]

    # Get sequence after selected movies
    seq_df = pd.DataFrame()
    for i in list(order_dict.keys()):
      df_temp = ratings_movies.loc[(ratings_movies['userId']==i) & ((order_dict[i]-10) <= ratings_movies['movies_order']) & ((order_dict[i]+10) >= ratings_movies['movies_order'])]
      seq_df = pd.concat([seq_df, df_temp])

    genres_seq = seq_df.groupby('userId')['genre'].apply(lambda x: x.tolist()) # Get sequence of genres by user

    # User stay in the same path or not
    user_path_df = pd.DataFrame()
    for user in list(genres_seq.index.values):
      check_previous = 0
      num_movies = 0
      for j in genres_seq[user][:n_before]: # check previous movies not in selected genre
        if selected_genre not in j:
          check_previous += 1
      for k in genres_seq[user][(n_before + 1):]: # check after watched stepping stone at least n_after movies
        if selected_genre in k:
          num_movies += 1
      if check_previous == n_before and num_movies >= n_after:
         user_path_df = pd.concat([user_path_df, seq_df[seq_df['userId']==user][['userId','movieId','title','genre','timestamp']]])
         num_movies_dict[(selected_movies,user)] = num_movies
         
    return user_path_df, num_movies_dict

In [9]:
def user_path_two_genres(selected_movies, n_before, n_after):
    filter_movies = ratings_movies[ratings_movies['movieId']==selected_movies] # Selected movies
    selected_genre = list(ratings_movies[ratings_movies['movieId']==selected_movies]['genres'].drop_duplicates())[0] # movie's genre
    genres_list = list(selected_genre.split('|'))

    num_movies_dict = {}

    # Check when user watch selected movies
    order_dict = {}
    for i in list(filter_movies.index.values):
      order_dict[filter_movies['userId'][i]] = filter_movies['movies_order'][i]

    # Get sequence after selected movies
    seq_df = pd.DataFrame()
    for i in list(order_dict.keys()):
      df_temp = ratings_movies.loc[(ratings_movies['userId']==i) & ((order_dict[i]-10) <= ratings_movies['movies_order']) & ((order_dict[i]+10) >= ratings_movies['movies_order'])]
      seq_df = pd.concat([seq_df, df_temp])

    genres_seq = seq_df.groupby('userId')['genre'].apply(lambda x: x.tolist()) # Get sequence of genres by user
    
    # User stay in the same path or not
    user_path_df = pd.DataFrame()
    for user in list(genres_seq.index.values):
      for g in range(len(genres_list)):
        count_1 = 0
        count_2 = 0
        num_movies = 0
        for j in genres_seq[user][:n_before]: 
          if genres_list[g] not in j: # check user never watched first genres before
            count_1 += 1
          if genres_list[(g+1)*abs(g-1)] in j: # check user watched second genres before
            count_2 += 1
        for k in genres_seq[user][(n_before + 1):]: # check after watched stepping stone at least n_after movies (ignore sequence)
          if genres_list[g] in k: # check user watched first genres after stepping tones
            num_movies += 1
        if count_1 == n_before and count_2 >= 1 and num_movies >= n_after:
          user_path_df = pd.concat([user_path_df, seq_df[seq_df['userId']==user][['userId','movieId','title','genre','timestamp']]])
          num_movies_dict[(selected_movies,user)] = num_movies
          
    return user_path_df, num_movies_dict

In [10]:
def distance_cal(genres, n_before, n_after, item_latent_df):
    """ Calculate distance from stepping stone to previoues n_before movies

    variables :
    genres           selected genres
    n_before         number of previous watch movies
    n_after          number of movie in target genres after watch stepping stone movies

    return:
    user_path_df    dataset of user's path fall in criteria
    distance_dict   key [movies,userid] value = [distance]
    num_movies_dict key [movies,userid] value = [number of movies in selected genres after watch stepping stone movie]
    """

    # Create list of all movies in selected genre
    if len(genres.split('|')) == 1:
      all = set(dataset[dataset['genres']==genres]['movieId'])
    else :
      all = set(dataset_two[dataset_two['genres']==genres]['movieId'])

    # Create data frame to contains all user path
    user_path_df = pd.DataFrame()

    # Create a dict to contains results distance
    distance_dict = {}
    num_movies_dict = {}

    # For each movies get user path
    for selected_movies in all:
      if len(genres.split('|')) == 1:
        temp_df, num_movies_dict_temp = find_user_path(selected_movies, n_before, n_after)
      else:
        temp_df, num_movies_dict_temp = user_path_two_genres(selected_movies, n_before, n_after)
      num_movies_dict.update(num_movies_dict_temp)
      user_path_df = pd.concat([user_path_df, temp_df])

      if len(temp_df) != 0:

        # Get embedding values
        temp_df = pd.merge(temp_df, item_latent_df, on = 'movieId', how = 'left') 

        # Genres sequence
        genres_seq = temp_df.groupby('userId')['genre'].apply(lambda x: x.tolist()) 

        # For each user calculate the avearge distance and count number of movies
        for user in list(temp_df['userId'].unique()):
          distance_list = []
          # Get array of stepping stone movies
          target_movie = temp_df[(temp_df['userId']==user) & (temp_df['movieId'] == selected_movies)][[f'{i}' for i in range(0, 64)]].to_numpy()
          # Calculate distance from steping stone movies to previous movies
          for pos in range(n_before):
            previous_movie = temp_df[temp_df['userId']==user][pos:pos+1][[f'{i}' for i in range(0, 64)]].to_numpy()
            distance_list.append(np.linalg.norm(target_movie - previous_movie))
          
          distance_dict[(selected_movies,user)] = distance_list

    return user_path_df, distance_dict, num_movies_dict

In [11]:
def cal_correlation(distance_dict, num_movies_dict, n_before):
    """ Calculate correlation between distance and number of movies """

    last_one = []
    last_three = []
    last_five = []
    last_all = []
    num_movies = []
    corr_list = []

    if len(distance_dict) < 2: # No user fall into changing path or only one user change (can't calculate correlation)
      corr_list = [['Not available','Not available','Not available','Not available']]

    else:
      for keys in distance_dict.keys():
        last_one.append(distance_dict[keys][n_before-1])
        last_three.append(np.nanmean(distance_dict[keys][n_before-3:n_before]))
        last_five.append(np.nanmean(distance_dict[keys][n_before-5:n_before]))
        last_all.append(np.nanmean(distance_dict[keys]))
        num_movies.append(num_movies_dict[keys])

      # calculate Pearson's correlation between avg distance and number of movies
      corr1, _ = pearsonr(last_one, num_movies)
      corr3, _ = pearsonr(last_three, num_movies)
      corr5, _ = pearsonr(last_five, num_movies)
      corrall, _ = pearsonr(last_all, num_movies)
      
      # Return all caculated correlation 
      corr_list.append([corr1,corr3,corr5,corrall])

    return corr_list

In [12]:
n_before = 10
n_after = 0

# Calculate correlation
user_path_diff = pd.DataFrame()
number_user = {}
correlation = {}

for g in unique_genres:
  print(g)
  temp_df, distance_dict, num_movies_dict = distance_cal(g, n_before, n_after, item_latent_df)
  user_path_diff = pd.concat([user_path_diff, temp_df])
  number_user[g] = len(temp_df['userId'].unique())
  correlation[g] = cal_correlation(distance_dict, num_movies_dict, n_before)

Adventure
Thriller
Children's
Musical
Fantasy
Film-Noir
Drama
Action
War
Crime
Romance
Animation
Western
Documentary
Sci-Fi
Comedy
Mystery
Horror


In [13]:
# Create a result dataframe
num_df = pd.DataFrame(number_user.items(), columns=['Genre', 'Number of users'])
corr_df = pd.DataFrame(correlation.items(), columns=['Genre', 'Corr'])
corr_df['Corr'] = corr_df['Corr'].str[0]
corr_df[['Last_one','Last_three','Last_five','all']] = pd.DataFrame(corr_df.Corr.tolist())
corr_df = corr_df.drop(columns=['Corr'])
corr_df = pd.merge(num_df, corr_df, on = 'Genre', how = 'left') 
corr_df

Unnamed: 0,Genre,Number of users,Last_one,Last_three,Last_five,all
0,Adventure,300,0.174052,0.197676,0.221554,0.176129
1,Thriller,1164,0.0305344,0.0582716,0.0608485,0.0761495
2,Children's,61,0.172981,0.130967,0.0398878,0.00317938
3,Musical,616,0.020767,-0.00626944,0.0113795,0.0267524
4,Fantasy,1,Not available,Not available,Not available,Not available
5,Film-Noir,216,0.111782,0.166195,0.139945,0.107153
6,Drama,1008,0.155696,0.177377,0.194909,0.204631
7,Action,417,-0.0269298,-0.019205,-0.0339114,-0.0108279
8,War,252,-0.0436395,-0.0532557,-0.0913876,-0.0973415
9,Crime,477,0.0781931,0.136055,0.174386,0.18008


In [14]:
# Calculate correlation
user_path_diff_2 = pd.DataFrame()
number_user_2 = {}
correlation_2 = {}

for g in two_genres:
  print(g)
  temp_df, distance_dict, num_movies_dict = distance_cal(g, n_before, n_after, item_latent_df)
  user_path_diff_2 = pd.concat([user_path_diff_2, temp_df])
  if len(temp_df) == 0:
    number_user_2[g] = 0
    correlation_2[g] = [['Not available','Not available','Not available','Not available']]
  else:
    number_user_2[g] = len(temp_df['userId'].unique())
    correlation_2[g] = cal_correlation(distance_dict, num_movies_dict, n_before)

Drama|Film-Noir
Horror|Mystery
Drama|War
Comedy|Crime
Children's|Drama
Horror|Romance
Crime|Horror
Animation|Sci-Fi
Action|War
Documentary|Musical
Drama|Romance
Mystery|Sci-Fi
Adventure|Fantasy
Fantasy|Sci-Fi
Comedy|Mystery
Animation|Children's
Comedy|Musical
Documentary|Drama
Action|Thriller
Adventure|Comedy
Children's|Fantasy




Adventure|Western
Crime|Film-Noir
Crime|Drama
Romance|Western
Comedy|War
Children's|Sci-Fi
Drama|Thriller
Film-Noir|Mystery
Children's|Horror
Comedy|Sci-Fi
Drama|Musical
Musical|Romance
Comedy|Drama
Adventure|Sci-Fi
Romance|War
Comedy|Horror
Adventure|Musical
Crime|Thriller
Adventure|Drama
Comedy|Western
Animation|Comedy
Action|Comedy
Film-Noir|Thriller
Horror|Sci-Fi
Mystery|Thriller
Crime|Mystery
Adventure|Romance
Animation|Mystery
Adventure|Thriller
Sci-Fi|War
Action|Western
Action|Crime
Animation|Musical
Adventure|War
Romance|Thriller
Musical|War
Action|Adventure
Action|Drama
Film-Noir|Sci-Fi
Horror|Thriller
Drama|Mystery
Action|Horror
Sci-Fi|Thriller
Documentary|War
Action|Sci-Fi
Children's|Comedy
Action|Romance
Film-Noir|Horror
Children's|Musical
Comedy|Thriller
Adventure|Children's
Drama|Western
Comedy|Romance
Drama|Horror
Drama|Sci-Fi
Drama|Fantasy
Action|Children's
Comedy|Documentary
Comedy|Fantasy


In [15]:
num_df_2 = pd.DataFrame(number_user_2.items(), columns=['Genre', 'Number of users'])
corr_df_2 = pd.DataFrame(correlation_2.items(), columns=['Genre', 'Corr'])
corr_df_2['Corr'] = corr_df_2['Corr'].str[0]
corr_df_2[['Last_one','Last_three','Last_five','all']] = pd.DataFrame(corr_df_2.Corr.tolist())
corr_df_2 = corr_df_2.drop(columns=['Corr'])
corr_df_2 = pd.merge(num_df_2, corr_df_2, on = 'Genre', how = 'left') 
corr_df_2

Unnamed: 0,Genre,Number of users,Last_one,Last_three,Last_five,all
0,Drama|Film-Noir,366,0.00435865,-0.0606245,-0.0725078,-0.0459234
1,Horror|Mystery,74,0.102673,0.162763,0.148282,0.105261
2,Drama|War,2359,0.0846205,0.0991443,0.0987927,0.103233
3,Comedy|Crime,1732,0.123688,0.12984,0.134061,0.142981
4,Children's|Drama,493,-0.00919581,0.000528152,0.0252212,0.0451905
...,...,...,...,...,...,...
75,Drama|Sci-Fi,1442,-0.0448937,-0.0357605,-0.0242805,-0.0361012
76,Drama|Fantasy,133,-0.0664244,0.018735,0.0569156,0.159367
77,Action|Children's,108,0.112908,0.0065169,-0.0497387,-0.0968441
78,Comedy|Documentary,760,0.0169801,0.0272856,0.0255408,0.0644046


In [16]:
corr_df_2.to_csv('/content/drive/My Drive/correlation.csv', index = False, header=True)