In [1]:
import pandas as pd
import pickle
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import random
from scipy.stats import pearsonr
from statistics import mean

In [2]:
# Read the ratings file: "::"-separated, no header row.
# engine="python" is requested explicitly because the default C parser does not
# support multi-character separators; without it pandas silently falls back to
# the python engine and emits a ParserWarning.
ratings = pd.read_csv(
    "/content/drive/My Drive/ratings.dat",
    header=None,
    sep="::",
    engine="python",
)
ratings = ratings.rename(columns={0: 'userId', 1: 'movieId', 2: 'rating', 3: 'timestamp'})

# Mapping between the real movie id and a dense integer code ('item_id').
# The codes are the category codes of movieId, so computing them on the
# de-duplicated ids gives the same mapping as computing them on every rating
# row, without temporarily adding (and then dropping) a column on `ratings`.
item_list = (
    ratings[['movieId']]
    .drop_duplicates()
    .assign(item_id=lambda df: df['movieId'].astype("category").cat.codes)
)

  


In [3]:
# Load the pickled item and user latent-factor objects.
# NOTE: pickle.load executes arbitrary code embedded in the file, so these
# files must come from a trusted source.
with open("/content/drive/My Drive/item_latent.pickle", 'rb') as item_latent, \
        open("/content/drive/My Drive/user_latent.pickle", 'rb') as user_latent:
    als_item_latent = pickle.load(item_latent)
    als_user_latent = pickle.load(user_latent)

In [4]:
# Load Movies dataset
rnames = ['movie_id', 'title', 'genres']
# engine='python' is stated explicitly: the multi-character '::' separator is
# not supported by the C parser and would otherwise trigger a ParserWarning.
movies = pd.read_table(
    '/content/drive/My Drive/movies.dat',
    sep='::',
    header=None,
    names=rnames,
    engine='python',
)

# Release year is the 4-digit number in parentheses inside the title.
# A raw string is used so that \( and \d are regex escapes rather than
# invalid Python string escapes.
movies['year'] = movies['title'].str.extract(r"\((\d{4})\)", expand=True)
movies['year'] = pd.to_datetime(movies['year'], format='%Y')
movies['year'] = movies['year'].dt.year  # As there are some NaN years, resulting type will be float (decimals)

# 'genre' is the list form of the '|'-separated 'genres' string;
# 'genre_count' is the number of genres attached to the movie.
movies['genre'] = movies['genres'].str.split('|')
movies['genre_count'] = movies['genre'].str.len()

# Keep only movies that appear in the ratings (item_list) and attach item_id.
movies = pd.merge(item_list, movies, left_on='movieId', right_on='movie_id', how='left')
movies = movies.drop(columns=['movie_id'])

# Create list of genres:
#   unique_genres - labels made of a single genre
#   mixed_genres  - labels combining several genres
#   two_genres    - the subset of mixed labels with exactly two genres
# .dropna().unique() (instead of set()) keeps a deterministic first-seen order
# across runs and skips movies whose metadata is missing after the left merge.
mixed_genres = []
two_genres = []
unique_genres = []
for genre_label in movies['genres'].dropna().unique():
    if "|" in genre_label:
        mixed_genres.append(genre_label)
        if genre_label.count('|') == 1:
            two_genres.append(genre_label)
    else:
        unique_genres.append(genre_label)

movies.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,movieId,item_id,title,genres,year,genre,genre_count
0,1193,1104,One Flew Over the Cuckoo's Nest (1975),Drama,1975,[Drama],1
1,661,639,James and the Giant Peach (1996),Animation|Children's|Musical,1996,"[Animation, Children's, Musical]",3
2,914,853,My Fair Lady (1964),Musical|Romance,1964,"[Musical, Romance]",2
3,3408,3177,Erin Brockovich (2000),Drama,2000,[Drama],1
4,2355,2162,"Bug's Life, A (1998)",Animation|Children's|Comedy,1998,"[Animation, Children's, Comedy]",3


In [5]:
# Build the item latent-factor table: one row per item, with the row position
# exposed as 'item_id' and joined to the real movieId through item_list.
item_latent_df = (
    pd.DataFrame(als_item_latent)
    .assign(item_id=lambda frame: frame.index)
    .merge(item_list, on='item_id')
    .dropna()
)
item_latent_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,item_id,movieId
0,0.046745,0.004665,0.041605,-0.045153,-0.006969,0.049296,-0.011195,-0.001268,-0.014303,0.026581,-0.034755,-0.019688,0.003687,-0.002150,0.022543,0.007811,0.025419,-0.015451,0.022414,0.007889,0.037029,0.024086,0.006834,0.000108,0.027037,0.041853,-0.019126,-0.005141,0.014016,-0.016257,-0.007346,-0.012583,-0.024129,-0.002330,0.014767,-0.002599,0.013485,0.013500,0.010503,0.015153,0,1
1,-0.000197,0.017216,-0.001482,0.023131,-0.011494,0.022521,0.014670,-0.006329,-0.013746,0.035266,-0.013890,0.027044,0.008546,0.017434,-0.023165,0.013478,0.033705,-0.004714,0.011139,-0.022652,0.003465,0.016279,-0.025399,0.018919,0.012230,0.021709,-0.010909,0.021161,0.013243,0.050482,0.002820,0.005104,0.010459,0.009120,-0.003430,0.034801,0.005748,-0.018434,0.003431,0.014474,1,2
2,-0.011725,0.026086,0.028532,0.003159,0.007351,-0.007402,0.012995,0.017656,0.021652,0.000577,0.004887,-0.002046,0.023741,0.008782,0.004038,-0.031399,0.027487,0.023028,-0.015516,-0.002818,0.026736,-0.000086,-0.002660,-0.002150,0.017108,-0.000832,0.024792,0.003460,0.012006,0.016066,0.008950,0.010968,0.008148,0.017594,-0.010370,0.021666,0.006869,-0.009042,0.009412,-0.007008,2,3
3,0.011892,0.020287,0.001064,-0.003999,0.010913,0.011460,-0.008980,-0.004667,0.015053,-0.009862,0.004724,0.010715,0.003834,-0.002608,-0.006101,-0.011693,0.001903,0.026781,0.008089,-0.009439,0.008117,-0.003225,0.013765,0.005577,0.008990,0.002148,0.016476,-0.001379,0.003906,0.014562,0.011203,0.010161,0.005269,0.002876,0.007479,0.000672,0.012229,-0.005918,0.010025,0.009985,3,4
4,-0.006257,0.020105,0.021775,-0.005948,-0.001758,0.018903,-0.000049,0.010384,0.009208,-0.002559,-0.000626,0.020402,-0.000748,0.007940,0.012041,-0.007600,0.004883,0.025656,-0.003049,-0.004923,0.009676,-0.003921,0.002669,0.010100,0.028021,-0.006140,0.025830,0.003281,-0.006023,0.015592,0.004909,0.016519,-0.004503,0.001049,0.011211,0.004705,0.017255,-0.016907,0.003034,-0.005289,4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3701,0.004264,-0.022085,0.045792,0.023598,0.031933,-0.020370,0.011070,-0.030361,0.030560,0.000345,-0.011246,-0.016911,-0.012760,-0.006792,0.004560,-0.007581,0.059154,-0.000905,-0.011861,0.028543,-0.017724,0.037803,0.011529,0.002558,0.011069,0.014177,0.014316,0.023860,0.031550,0.020560,-0.016755,-0.012659,0.033473,-0.039665,0.035979,0.029890,0.029220,0.010647,-0.016044,-0.016041,3701,3948
3702,0.012336,-0.027397,0.014222,0.016543,0.026509,-0.017530,0.007730,-0.001837,0.028080,0.009089,0.000090,-0.002353,-0.011142,-0.009681,-0.015289,-0.003455,0.026566,0.003606,-0.001011,0.022174,-0.005210,0.002247,-0.007505,0.007276,0.008129,0.019681,0.022333,0.031267,0.003027,0.009742,-0.008979,0.017477,0.016499,-0.026560,0.016633,0.015255,0.004937,0.031617,-0.006910,0.005375,3702,3949
3703,0.004111,-0.009254,0.001905,0.004687,0.007900,-0.006425,0.001341,0.006951,0.010821,0.005809,-0.003340,0.004137,0.002478,-0.008563,0.002508,0.003072,0.002078,-0.003202,-0.000101,0.008898,0.007334,0.005783,0.004581,0.007870,0.006090,-0.001629,0.005794,0.009109,0.004222,0.007774,-0.007072,0.002482,0.005776,-0.006992,0.010144,0.007480,-0.001084,0.009248,0.002132,0.005582,3703,3950
3704,0.009575,-0.004880,0.002687,0.005160,0.009054,-0.005754,-0.000474,-0.001078,0.007084,0.000329,-0.004199,0.008029,-0.001672,-0.014337,0.000474,-0.002443,0.008420,0.003440,0.003274,0.007454,0.008010,0.003611,0.000062,0.001786,0.002844,0.001183,0.009164,0.010159,0.004380,0.002751,-0.005889,0.003077,0.001679,-0.007720,0.008584,0.004432,-0.001949,0.011409,-0.004800,0.004582,3704,3951


In [6]:
# Drop the helper key before joining with the movie metadata (which carries
# its own item_id). errors='ignore' keeps this cell re-runnable: on a second
# execution the column is already gone and a plain drop would raise KeyError.
item_latent_df = item_latent_df.drop(columns=['item_id'], errors='ignore')

# Latent factors + movie metadata, keeping only fully populated rows.
dataset_all = pd.merge(item_latent_df, movies, on='movieId').dropna()
dataset_all.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,movieId,item_id,title,genres,year,genre,genre_count
0,0.046745,0.004665,0.041605,-0.045153,-0.006969,0.049296,-0.011195,-0.001268,-0.014303,0.026581,-0.034755,-0.019688,0.003687,-0.00215,0.022543,0.007811,0.025419,-0.015451,0.022414,0.007889,0.037029,0.024086,0.006834,0.000108,0.027037,0.041853,-0.019126,-0.005141,0.014016,-0.016257,-0.007346,-0.012583,-0.024129,-0.00233,0.014767,-0.002599,0.013485,0.0135,0.010503,0.015153,1,0,Toy Story (1995),Animation|Children's|Comedy,1995,"[Animation, Children's, Comedy]",3
1,-0.000197,0.017216,-0.001482,0.023131,-0.011494,0.022521,0.01467,-0.006329,-0.013746,0.035266,-0.01389,0.027044,0.008546,0.017434,-0.023165,0.013478,0.033705,-0.004714,0.011139,-0.022652,0.003465,0.016279,-0.025399,0.018919,0.01223,0.021709,-0.010909,0.021161,0.013243,0.050482,0.00282,0.005104,0.010459,0.00912,-0.00343,0.034801,0.005748,-0.018434,0.003431,0.014474,2,1,Jumanji (1995),Adventure|Children's|Fantasy,1995,"[Adventure, Children's, Fantasy]",3
2,-0.011725,0.026086,0.028532,0.003159,0.007351,-0.007402,0.012995,0.017656,0.021652,0.000577,0.004887,-0.002046,0.023741,0.008782,0.004038,-0.031399,0.027487,0.023028,-0.015516,-0.002818,0.026736,-8.6e-05,-0.00266,-0.00215,0.017108,-0.000832,0.024792,0.00346,0.012006,0.016066,0.00895,0.010968,0.008148,0.017594,-0.01037,0.021666,0.006869,-0.009042,0.009412,-0.007008,3,2,Grumpier Old Men (1995),Comedy|Romance,1995,"[Comedy, Romance]",2
3,0.011892,0.020287,0.001064,-0.003999,0.010913,0.01146,-0.00898,-0.004667,0.015053,-0.009862,0.004724,0.010715,0.003834,-0.002608,-0.006101,-0.011693,0.001903,0.026781,0.008089,-0.009439,0.008117,-0.003225,0.013765,0.005577,0.00899,0.002148,0.016476,-0.001379,0.003906,0.014562,0.011203,0.010161,0.005269,0.002876,0.007479,0.000672,0.012229,-0.005918,0.010025,0.009985,4,3,Waiting to Exhale (1995),Comedy|Drama,1995,"[Comedy, Drama]",2
4,-0.006257,0.020105,0.021775,-0.005948,-0.001758,0.018903,-4.9e-05,0.010384,0.009208,-0.002559,-0.000626,0.020402,-0.000748,0.00794,0.012041,-0.0076,0.004883,0.025656,-0.003049,-0.004923,0.009676,-0.003921,0.002669,0.0101,0.028021,-0.00614,0.02583,0.003281,-0.006023,0.015592,0.004909,0.016519,-0.004503,0.001049,0.011211,0.004705,0.017255,-0.016907,0.003034,-0.005289,5,4,Father of the Bride Part II (1995),Comedy,1995,[Comedy],1


In [7]:
# Movies labelled with exactly two genres
dataset_two = dataset_all[dataset_all['genre_count'].eq(2)]

# Movies labelled with exactly one genre
dataset = dataset_all[dataset_all['genre_count'].eq(1)]

In [8]:
# Attach movie metadata to every rating, order each user's ratings by
# timestamp, and number them 1, 2, 3, ... per user in 'movies_order'.
ratings_movies = (
    ratings
    .merge(movies, on='movieId', how='left')
    .sort_values(by=['userId', 'timestamp'])
)
ratings_movies['movies_order'] = ratings_movies.groupby(by=['userId']).cumcount() + 1
ratings_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,item_id,title,genres,year,genre,genre_count,movies_order
31,1,3186,4,978300019,2969,"Girl, Interrupted (1999)",Drama,1999,[Drama],1,1
22,1,1270,5,978300055,1178,Back to the Future (1985),Comedy|Sci-Fi,1985,"[Comedy, Sci-Fi]",2,2
27,1,1721,4,978300055,1574,Titanic (1997),Drama|Romance,1997,"[Drama, Romance]",2,3
37,1,1022,5,978300055,957,Cinderella (1950),Animation|Children's|Musical,1950,"[Animation, Children's, Musical]",3,4
24,1,2340,3,978300103,2147,Meet Joe Black (1998),Romance,1998,[Romance],1,5


In [9]:
def find_user_path(selected_movies, n_before, n_after, n_window_after=10):
    """Find the users for whom a single-genre movie acted as a stepping stone.

    For every user who rated `selected_movies`, a window of ratings around that
    movie is taken from `ratings_movies` (ordered by 'movies_order'): the
    `n_before` ratings preceding it, the movie itself, and up to
    `n_window_after` ratings following it. A user qualifies when

    * none of the `n_before` preceding movies carries the movie's genre, and
    * at least `n_after` of the following movies in the window carry it.

    The look-back window previously was hard-coded to 10 ratings while the
    slicing used `n_before`, which mis-aligned the stepping stone for any
    `n_before` other than 10; the window now follows `n_before`.

    variables :
    selected_movies  movieId of the candidate stepping stone movie
    n_before         number of previously watched movies to inspect
    n_after          minimum number of movies in the target genre after the
                     stepping stone movie
    n_window_after   number of ratings kept after the stepping stone
                     (default 10, the size used originally)

    return:
    user_path_df     rows (userId, movieId, title, genre, timestamp) of the
                     window of every qualifying user; empty DataFrame if none
    num_movies_dict  key (selected_movies, userId) -> number of target-genre
                     movies found after the stepping stone
    """
    stone_rows = ratings_movies[ratings_movies['movieId'] == selected_movies]
    if stone_rows.empty:
        # Nobody rated this movie: nothing to analyse.
        return pd.DataFrame(), {}

    # 'genres' of the selected movie (the raw label string)
    selected_genre = stone_rows['genres'].iloc[0]

    # movies_order at which each user rated the selected movie
    stone_order = (
        stone_rows
        .drop_duplicates('userId', keep='last')
        .set_index('userId')['movies_order']
    )

    # Keep, in one vectorised pass, every rating lying inside its user's window.
    # Users who never rated the movie map to NaN and fail both comparisons.
    offset = ratings_movies['movies_order'] - ratings_movies['userId'].map(stone_order)
    seq_df = ratings_movies[(offset >= -n_before) & (offset <= n_window_after)]

    # Per-user sequence of genre lists, in rating order
    genres_seq = seq_df.groupby('userId')['genre'].apply(lambda x: x.tolist())

    path_columns = ['userId', 'movieId', 'title', 'genre', 'timestamp']
    qualifying_frames = []
    num_movies_dict = {}
    for user in list(genres_seq.index.values):
        if stone_order[user] <= n_before:
            # Fewer than n_before earlier ratings: no complete look-back window.
            continue
        user_genres = genres_seq[user]
        before = user_genres[:n_before]        # ratings preceding the stepping stone
        after = user_genres[(n_before + 1):]   # ratings following it
        if any(selected_genre in watched for watched in before):
            continue  # the genre was already familiar to the user
        num_movies = sum(selected_genre in watched for watched in after)
        if num_movies >= n_after:
            qualifying_frames.append(seq_df.loc[seq_df['userId'] == user, path_columns])
            num_movies_dict[(selected_movies, user)] = num_movies

    # Concatenate once instead of growing a DataFrame inside the loop.
    user_path_df = pd.concat(qualifying_frames) if qualifying_frames else pd.DataFrame()
    return user_path_df, num_movies_dict

In [10]:
def user_path_two_genres(selected_movies, n_before, n_after, n_window_after=10):
    """Find the users for whom a two-genre movie acted as a stepping stone.

    The movie's 'genres' label is split into its two genres. For every user
    who rated `selected_movies`, a window of ratings around that movie is taken
    from `ratings_movies`: the `n_before` ratings preceding it, the movie
    itself, and up to `n_window_after` ratings following it. Each of the two
    genres is tried as the "new" (target) genre, the other one being the
    "familiar" genre. A user qualifies when

    * none of the `n_before` preceding movies carries the target genre,
    * at least one of them carries the familiar genre, and
    * at least `n_after` of the following movies carry the target genre.

    The look-back window previously was hard-coded to 10 ratings while the
    slicing used `n_before`, which mis-aligned the stepping stone for any
    `n_before` other than 10; the window now follows `n_before`.

    variables :
    selected_movies  movieId of the candidate stepping stone movie
    n_before         number of previously watched movies to inspect
    n_after          minimum number of target-genre movies after the
                     stepping stone movie
    n_window_after   number of ratings kept after the stepping stone
                     (default 10, the size used originally)

    return:
    user_path_df     rows (userId, movieId, title, genre, timestamp) of the
                     window of every qualifying user; empty DataFrame if none
    num_movies_dict  key (selected_movies, userId) -> number of target-genre
                     movies found after the stepping stone

    raises:
    ValueError       if the selected movie is not labelled with exactly two genres
    """
    stone_rows = ratings_movies[ratings_movies['movieId'] == selected_movies]
    if stone_rows.empty:
        # Nobody rated this movie: nothing to analyse.
        return pd.DataFrame(), {}

    selected_genre = stone_rows['genres'].iloc[0]  # e.g. "Comedy|Romance"
    genres_list = selected_genre.split('|')
    if len(genres_list) != 2:
        raise ValueError(
            "user_path_two_genres expects a movie with exactly two genres, got %r" % selected_genre
        )
    # (target genre, familiar genre) in both directions
    genre_pairs = [(genres_list[0], genres_list[1]), (genres_list[1], genres_list[0])]

    # movies_order at which each user rated the selected movie
    stone_order = (
        stone_rows
        .drop_duplicates('userId', keep='last')
        .set_index('userId')['movies_order']
    )

    # Keep, in one vectorised pass, every rating lying inside its user's window.
    # Users who never rated the movie map to NaN and fail both comparisons.
    offset = ratings_movies['movies_order'] - ratings_movies['userId'].map(stone_order)
    seq_df = ratings_movies[(offset >= -n_before) & (offset <= n_window_after)]

    # Per-user sequence of genre lists, in rating order
    genres_seq = seq_df.groupby('userId')['genre'].apply(lambda x: x.tolist())

    path_columns = ['userId', 'movieId', 'title', 'genre', 'timestamp']
    qualifying_frames = []
    num_movies_dict = {}
    for user in list(genres_seq.index.values):
        if stone_order[user] <= n_before:
            # Fewer than n_before earlier ratings: no complete look-back window.
            continue
        user_genres = genres_seq[user]
        before = user_genres[:n_before]        # ratings preceding the stepping stone
        after = user_genres[(n_before + 1):]   # ratings following it (order ignored)
        for target_genre, familiar_genre in genre_pairs:
            if any(target_genre in watched for watched in before):
                continue  # target genre was not new to the user
            if not any(familiar_genre in watched for watched in before):
                continue  # user had no history in the other genre either
            num_movies = sum(target_genre in watched for watched in after)
            if num_movies >= n_after:
                qualifying_frames.append(seq_df.loc[seq_df['userId'] == user, path_columns])
                num_movies_dict[(selected_movies, user)] = num_movies

    # Concatenate once instead of growing a DataFrame inside the loop.
    user_path_df = pd.concat(qualifying_frames) if qualifying_frames else pd.DataFrame()
    return user_path_df, num_movies_dict

In [11]:
def distance_cal(genres, n_before, n_after, item_latent_df):
    """Calculate distances from each stepping stone movie to the previous n_before movies.

    Every movie of `dataset` (single genre) or `dataset_two` (two genres)
    whose 'genres' label equals `genres` is treated as a candidate stepping
    stone. The qualifying user paths are obtained from `find_user_path` or
    `user_path_two_genres`; for each qualifying user the Euclidean distance
    in latent space between the stepping stone and each of the movies rated
    just before it is computed.

    variables :
    genres           selected genres label ('Drama' or 'Comedy|Romance')
    n_before         number of previously watched movies
    n_after          number of movies in the target genre after the stepping stone movie
    item_latent_df   latent factors: one integer-labelled column per latent
                     dimension plus a 'movieId' column

    return:
    user_path_df    dataset of the user paths that fall in the criteria
    distance_dict   key (movieId, userId) -> list of distances, oldest previous movie first
    num_movies_dict key (movieId, userId) -> number of movies in the selected
                    genres after the stepping stone movie

    raises:
    ValueError      if item_latent_df has no integer-labelled latent columns
    """
    single_genre = len(genres.split('|')) == 1
    source = dataset if single_genre else dataset_two
    path_finder = find_user_path if single_genre else user_path_two_genres

    # All movies in the selected genre (named to avoid shadowing the builtin `all`)
    candidate_movies = set(source[source['genres'] == genres]['movieId'])

    # Latent dimensions are the integer-labelled columns; deriving them from the
    # frame avoids hard-coding the number of factors (previously fixed at 40).
    latent_cols = [
        col for col in item_latent_df.columns
        if isinstance(col, (int, np.integer)) and not isinstance(col, bool)
    ]
    if not latent_cols:
        raise ValueError("item_latent_df has no integer-labelled latent factor columns")

    path_frames = []
    distance_dict = {}
    num_movies_dict = {}

    # For each candidate movie get the qualifying user paths
    for selected_movies in candidate_movies:
        temp_df, num_movies_dict_temp = path_finder(selected_movies, n_before, n_after)
        num_movies_dict.update(num_movies_dict_temp)
        if len(temp_df) == 0:
            continue
        path_frames.append(temp_df)

        # Attach the embedding of every movie on the path (left merge keeps row order)
        temp_df = pd.merge(temp_df, item_latent_df, on='movieId', how='left')

        for user in list(temp_df['userId'].unique()):
            user_rows = temp_df[temp_df['userId'] == user]
            vectors = user_rows[latent_cols].to_numpy()

            # Locate the stepping stone inside the user's path instead of
            # assuming it sits right after the first n_before rows.
            stone_pos = int(np.flatnonzero(user_rows['movieId'].to_numpy() == selected_movies)[0])
            previous = vectors[max(stone_pos - n_before, 0):stone_pos]

            # Euclidean distance from the stepping stone to each previous movie
            distances = np.linalg.norm(previous - vectors[stone_pos], axis=1)
            distance_dict[(selected_movies, user)] = distances.tolist()

    # Concatenate once instead of growing a DataFrame inside the loop.
    user_path_df = pd.concat(path_frames) if path_frames else pd.DataFrame()
    return user_path_df, distance_dict, num_movies_dict

In [12]:
def cal_correlation(distance_dict, num_movies_dict, n_before):
    """Calculate correlation between distance and number of movies.

    For every (movie, user) key the distances to the previous movies are
    summarised four ways: the last one, the mean of the last three, the mean
    of the last five and the mean of all of them. Each summary is correlated
    (Pearson) with the number of target-genre movies watched afterwards.

    variables :
    distance_dict    key (movie, user) -> list of distances, oldest first
    num_movies_dict  key (movie, user) -> number of movies after the stepping stone
    n_before         number of previous movies each distance list covers

    return:
    A one-element list holding [corr_last_one, corr_last_three,
    corr_last_five, corr_all]; each entry is 'Not available' when fewer than
    two (movie, user) pairs exist, since a correlation cannot be computed.
    """
    if len(distance_dict) < 2:  # no user, or a single user: correlation is undefined
        return [['Not available', 'Not available', 'Not available', 'Not available']]

    last_one = []
    last_three = []
    last_five = []
    last_all = []
    num_movies = []

    for key, distances in distance_dict.items():
        last_one.append(distances[n_before - 1])
        # Clamp the slice start at 0: with n_before < 3 (or < 5) a negative
        # start would silently select the wrong elements instead of all of them.
        last_three.append(mean(distances[max(n_before - 3, 0):n_before]))
        last_five.append(mean(distances[max(n_before - 5, 0):n_before]))
        last_all.append(mean(distances))
        num_movies.append(num_movies_dict[key])

    # Pearson's correlation between each distance summary and the number of movies
    corr1, _ = pearsonr(last_one, num_movies)
    corr3, _ = pearsonr(last_three, num_movies)
    corr5, _ = pearsonr(last_five, num_movies)
    corrall, _ = pearsonr(last_all, num_movies)

    return [[corr1, corr3, corr5, corrall]]

In [13]:
n_before = 10
n_after = 0

# Calculate correlation for every single-genre label
path_frames = []
number_user = {}
correlation = {}

for g in unique_genres:
    print(g)
    temp_df, distance_dict, num_movies_dict = distance_cal(g, n_before, n_after, item_latent_df)
    path_frames.append(temp_df)
    # When no user qualifies, distance_cal returns a DataFrame without columns,
    # so temp_df['userId'] would raise KeyError; count zero users instead.
    number_user[g] = len(temp_df['userId'].unique()) if len(temp_df) != 0 else 0
    # cal_correlation itself reports 'Not available' for fewer than two users.
    correlation[g] = cal_correlation(distance_dict, num_movies_dict, n_before)

# Concatenate once instead of growing the DataFrame inside the loop.
user_path_diff = pd.concat(path_frames) if path_frames else pd.DataFrame()

War
Drama
Film-Noir
Comedy
Adventure
Sci-Fi
Children's
Animation
Documentary
Western
Action
Romance
Musical
Thriller
Fantasy
Mystery
Crime
Horror


In [14]:
# Assemble the result table: one row per genre with the number of qualifying
# users and the four correlation values.
num_df = pd.DataFrame(number_user.items(), columns=['Genre', 'Number of users'])

# Each dict value is a one-element list wrapping the four correlations.
corr_df = pd.DataFrame(
    [[genre_name, *corr_rows[0]] for genre_name, corr_rows in correlation.items()],
    columns=['Genre', 'Last_one', 'Last_three', 'Last_five', 'all'],
)
corr_df = num_df.merge(corr_df, on='Genre', how='left')
corr_df

Unnamed: 0,Genre,Number of users,Last_one,Last_three,Last_five,all
0,War,252,0.100549,0.141422,0.11988,0.131951
1,Drama,1008,0.125507,0.144304,0.156802,0.161021
2,Film-Noir,216,0.0933708,0.171151,0.158859,0.13523
3,Comedy,1142,0.0970413,0.101183,0.107338,0.105595
4,Adventure,300,0.0517165,0.0571634,0.0989382,0.0978812
5,Sci-Fi,669,0.103845,0.061074,0.0710153,0.0635827
6,Children's,61,0.00842043,0.0146438,0.0275055,0.0195525
7,Animation,227,-0.0909887,-0.0402891,-0.0421301,-0.0220867
8,Documentary,1697,0.0314226,0.03872,0.0407128,0.0529997
9,Western,1298,-0.0107422,0.0304463,0.0350219,0.050439


In [15]:
# Calculate correlation for every two-genre label
user_path_diff_2 = pd.DataFrame()
number_user_2 = {}
correlation_2 = {}

for g in two_genres:
    print(g)
    temp_df, distance_dict, num_movies_dict = distance_cal(g, n_before, n_after, item_latent_df)
    user_path_diff_2 = pd.concat([user_path_diff_2, temp_df])
    if len(temp_df) != 0:
        number_user_2[g] = len(temp_df['userId'].unique())
        correlation_2[g] = cal_correlation(distance_dict, num_movies_dict, n_before)
        continue
    # No user path qualified for this genre pair
    number_user_2[g] = 0
    correlation_2[g] = [['Not available', 'Not available', 'Not available', 'Not available']]

Action|Romance
Drama|Romance
Action|Sci-Fi
Adventure|Fantasy
Horror|Romance
Action|Crime
Children's|Horror
Animation|Children's
Documentary|Drama
Comedy|Thriller
Adventure|Sci-Fi
Action|Children's
Children's|Fantasy




Action|Comedy
Comedy|Romance
Comedy|Crime
Children's|Comedy
Mystery|Sci-Fi
Action|Drama
Comedy|Drama
Musical|War
Crime|Drama
Adventure|Musical
Crime|Film-Noir
Romance|War
Sci-Fi|Thriller
Crime|Horror
Sci-Fi|War
Animation|Mystery
Action|Horror
Comedy|Documentary
Fantasy|Sci-Fi
Drama|Horror
Children's|Drama
Comedy|Horror
Action|Adventure
Adventure|Comedy
Comedy|Sci-Fi
Musical|Romance
Adventure|Drama
Romance|Western
Comedy|Mystery
Action|Western
Comedy|Fantasy
Adventure|Western
Documentary|War
Drama|War
Adventure|War
Documentary|Musical
Mystery|Thriller
Action|War
Drama|Thriller
Comedy|Western
Film-Noir|Mystery
Drama|Western
Action|Thriller
Drama|Film-Noir
Romance|Thriller
Comedy|War
Adventure|Romance
Horror|Mystery
Crime|Mystery
Film-Noir|Horror
Horror|Sci-Fi
Drama|Sci-Fi
Crime|Thriller
Animation|Sci-Fi
Animation|Comedy
Animation|Musical
Children's|Musical
Film-Noir|Sci-Fi
Comedy|Musical
Drama|Fantasy
Drama|Musical
Drama|Mystery
Children's|Sci-Fi
Horror|Thriller
Film-Noir|Thriller
Advent

In [16]:
# Assemble the two-genre result table: number of qualifying users and the
# four correlation values per genre pair.
num_df_2 = pd.DataFrame(number_user_2.items(), columns=['Genre', 'Number of users'])

# Each dict value is a one-element list wrapping the four correlations.
corr_df_2 = pd.DataFrame(
    [[genre_pair, *corr_rows[0]] for genre_pair, corr_rows in correlation_2.items()],
    columns=['Genre', 'Last_one', 'Last_three', 'Last_five', 'all'],
)
corr_df_2 = num_df_2.merge(corr_df_2, on='Genre', how='left')
corr_df_2

Unnamed: 0,Genre,Number of users,Last_one,Last_three,Last_five,all
0,Action|Romance,476,0.204535,0.204093,0.196051,0.210659
1,Drama|Romance,2378,0.000143751,0.0172733,0.0149697,0.00555553
2,Action|Sci-Fi,1109,0.0620948,0.0851169,0.0979235,0.101038
3,Adventure|Fantasy,131,-0.0283609,-0.0907707,-0.183031,-0.144511
4,Horror|Romance,894,0.00699762,-0.000559323,0.02292,0.0416166
...,...,...,...,...,...,...
75,Children's|Sci-Fi,88,-0.0771,-0.107345,-0.0852764,0.0482011
76,Horror|Thriller,1544,-0.0370679,-0.0340661,-0.0349944,-0.0401791
77,Film-Noir|Thriller,491,0.046297,-0.015847,0.00247177,0.00425364
78,Adventure|Children's,404,-0.0417349,-0.0332183,-0.0395811,-0.0520319


In [17]:
# Save the two-genre correlation table as CSV (header row kept, index column omitted).
corr_df_2.to_csv(
    path_or_buf='/content/drive/My Drive/correlation.csv',
    header=True,
    index=False,
)