In [1]:
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import time
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pylab 
import scipy.stats as stats

In [2]:
def data_read(path='data/combined_data_1.txt'):
    '''
    Read one raw ratings file into a DataFrame.

    The file is parsed as header-less CSV; its first three fields are named
    Cust_Id, Rating and Date. Only a single file is read because only the
    top 1M ratings are needed later on.

    params:
        str: path. Location of the ratings file. Defaults to the first
             combined data file, so calling data_read() with no argument
             behaves as before.

    returns:
        DataFrame: df. Columns Cust_Id, Rating (float) and Date. Rows that
                   have no rating field carry NaN in Rating.
    '''
    df = pd.read_csv(path, header=None, names=['Cust_Id', 'Rating', 'Date'], usecols=[0, 1, 2])
    df['Rating'] = df['Rating'].astype(float)

    print('Dataset 1 shape: {}'.format(df.shape))
    print('-Dataset examples-')
    # Every 5,000,000th row: a cheap spot check across a very long file.
    print(df.iloc[::5000000, :])
    print(df)
    return df

In [3]:
def data_purify(df, n_rows=1000000):
    '''
    Attach a movie id to every rating row and keep the first n_rows ratings.

    Rows whose Rating is null are treated as movie header rows; each rating
    row belongs to the closest header above it. Movie ids are assigned
    sequentially (1 for the first header, 2 for the second, ...); the text
    of the header row itself is not parsed.

    params:
        DataFrame: df. Raw frame with at least Cust_Id and Rating columns.
                   It is not modified.

        int: n_rows. Number of leading rating rows to return. Default 1000000.

    returns:
        DataFrame: df. Rating rows only, with the extra column "index"
                   (added by reset_index), Cust_Id converted to int and a
                   new int column Movie_Id.

    raises:
        ValueError: if a rating row appears before the first header row,
                    because such a row cannot be attributed to a movie.
    '''
    start = time.time()
    df = df.reset_index()

    is_header = df['Rating'].isnull()
    # The running count of header rows seen so far is exactly the sequential
    # movie id of each row; one vectorised pass replaces repeated np.append
    # calls, each of which copied the whole array built up to that point.
    movie_np = is_header.cumsum()[~is_header].values
    # The count never decreases, so checking the first rating row is enough.
    if len(movie_np) and movie_np[0] == 0:
        raise ValueError('rating rows found before the first movie header row')

    end = time.time()
    print("time is ", end-start)
    print('Movie numpy: {}'.format(movie_np))
    print('Length: {}'.format(len(movie_np)))

    # .copy() gives an independent frame, so the column assignments below do
    # not write into a filtered slice of the original.
    df = df[~is_header].copy()
    df['Movie_Id'] = movie_np.astype(int)
    df['Cust_Id'] = df['Cust_Id'].astype(int)
    print('-Dataset examples-')
    print(df.iloc[::5000000, :])
    # Return a copy so that callers can add or replace columns safely.
    return df.iloc[:n_rows, :].copy()

In [4]:
def group_by_time(df):
    '''
    Sort the ratings chronologically and split them into one frame per year.

    params:
        DataFrame: df. Needs a Date column that pd.to_datetime can parse.
                   It is not modified.

    returns:
        list: group_dic. One DataFrame per calendar year, in ascending year
              order. Inside each frame the rows are in chronological order
              and Date is a datetime column.
    '''
    # assign() builds a new frame, leaving the caller's DataFrame untouched.
    df = df.assign(Date=pd.to_datetime(df["Date"]))
    # sort_values returns a new frame: the result must be kept, otherwise the
    # sort has no effect. mergesort is stable, so equal dates keep their order.
    df = df.sort_values(by=['Date'], kind='mergesort')
    group = df.groupby(df['Date'].dt.year)
    for key, con in group:
        print(key, con)
    group_dic = [v for _, v in group]
    return group_dic

In [5]:
def data_preprocessing():
    '''
    Run the whole preprocessing pipeline: read the raw file, purify it and
    group the result by time.

    returns:
        list: group_dic. The value returned by group_by_time for the
              purified data, i.e. the top 1M ratings grouped by year.
    '''
    purified = data_purify(data_read())
    return group_by_time(purified)
    
    
    

In [7]:
'''
The section of codes consists of 4 functions used in network projections.

'''
def df_to_edges(df):
    '''
    Build the weighted edge list of the customer-movie bipartite graph.

    Customer ids are converted to str and movie ids to int, so a customer
    label never compares equal to a movie label.

    params:
        DataFrame: df. Needs Cust_Id, Movie_Id and Rating columns.
                   It is not modified.

    returns:
        list: one (customer, movie, rating) tuple per row, in row order.
    '''
    customers = df["Cust_Id"].astype(str)
    movies = df["Movie_Id"].astype(int)
    # zip over whole columns yields the same tuples as iterating row by row,
    # without building a Series per row and without assigning into a
    # column-subset frame.
    return list(zip(customers, movies, df["Rating"]))

def df_to_nodes(df):
    '''
    Create an edgeless graph with one node per distinct customer and one
    node per distinct movie.

    Customer nodes are labelled with the string form of Cust_Id and tagged
    bipartite=0; movie nodes are labelled with the Movie_Id value and tagged
    bipartite=1.

    params:
        DataFrame: df. Needs Cust_Id and Movie_Id columns.

    returns:
        networkx_object: the graph holding the two node sets, without edges.
    '''
    customer_labels = pd.unique(df["Cust_Id"]).astype(str)
    movie_labels = pd.unique(df["Movie_Id"])

    graph = nx.Graph()
    # Customers first, then movies, each side tagged with its bipartite index.
    for labels, side in ((customer_labels, 0), (movie_labels, 1)):
        graph.add_nodes_from(labels, bipartite=side)
    return graph

def proj(B, nodes):
    '''
    Project the graph B onto the given nodes.

    Two of the given nodes are linked in the result when they share at
    least one neighbour in B. Every projected edge carries two attributes:
        weight: the number of shared neighbours.
        dist:   the mean, over the shared neighbours c, of
                abs(w(u, c) - w(v, c)) + 1, where w is the "weight"
                attribute of the corresponding edge in B.

    params:
        networkx_object: B. Graph whose edges have a numeric "weight"
                         attribute.

        collection: nodes. The nodes of B to project onto.

    returns:
        networkx_object: G. New graph containing the given nodes (with their
                         attributes from B) and the projected edges.
    '''
    G = nx.Graph()
    G.graph.update(B.graph)
    G.add_nodes_from((n, B.nodes[n]) for n in nodes)

    for u in nodes:
        u_adj = B[u]
        unbrs = set(u_adj)
        # Nodes two steps away from u, i.e. those sharing a neighbour with u.
        nbrs2 = {n for nbr in unbrs for n in B[nbr]} - {u}
        for v in nbrs2:
            # Both attributes are symmetric in (u, v); a pair already linked
            # from the other endpoint would be recomputed to the same values.
            if G.has_edge(u, v):
                continue
            v_adj = B[v]
            # v is two steps from u, so there is at least one shared neighbour.
            common = unbrs & set(v_adj)
            total_dis = sum(abs(u_adj[co]["weight"] - v_adj[co]["weight"]) + 1 for co in common)
            G.add_edge(u, v, weight=len(common), dist=total_dis / len(common))
    return G

def projection(B):
    '''
    Project B onto the nodes that are not tagged bipartite=0.

    params:
        networkx_object: B. Graph whose nodes all carry a "bipartite"
                         attribute.

    returns:
        networkx_object: the result of proj for those nodes.
    '''
    side_zero = set()
    for node, attrs in B.nodes(data=True):
        if attrs["bipartite"] == 0:
            side_zero.add(node)
    other_side = set(B).difference(side_zero)
    return proj(B, other_side)

In [None]:
def get_image(group_dic, index, time_length='year'):
    '''
    params:
        list: group_dic. The per-period DataFrames of the top 1M data, as
              returned by data_preprocessing. Each frame needs Cust_Id,
              Movie_Id and Rating columns.

        int: index. Position of the group to draw; it is also used in the
             names of the saved images.

        string: time_length. Label of the grouping period, used in the names
                of the saved images. Default 'year'

    returns:
        networkx_object: B. The projected movie network with 2 edge
                         properties, dist and weight. When index is out of
                         range, a message is printed and False is returned
                         instead.

    This function does four things:
        1. Draw and save a graph of the bipartite network for users and movies
        2. Draw and save a graph of the projected network for the movies
        3. Draw and save a graph of the heat map for the weights of the projected movies.
        4. return the projected movie B.

    The images are written under plot/bi, plot/pro and plot/heat; these
    directories are not created here.
    '''
    if index >= len(group_dic):
        print("index out of range")
        return False
    ddf = group_dic[index]
    GG = df_to_nodes(ddf)
    GG.add_weighted_edges_from(df_to_edges(ddf))

    # 1. Bipartite graph: bipartite=0 nodes on one side, the rest on the other.
    top_nodes = {n for n, d in GG.nodes(data=True) if d["bipartite"] == 0}
    bottom_nodes = set(GG) - top_nodes
    pos = nx.bipartite_layout(GG, top_nodes)
    plt.figure()

    nx.draw_networkx_nodes(GG, nodelist=top_nodes, pos=pos, node_shape="d", node_size=5, linewidths=0.05)
    nx.draw_networkx_nodes(GG, nodelist=bottom_nodes, pos=pos, node_shape="o", node_size=5, linewidths=0.05)
    nx.draw_networkx_edges(GG, pos=pos, width=0.1)
    statement = "plot/bi/bipartite graph " + time_length + " for the " + str(index) + " group.png"
    plt.savefig(statement, dpi=300, bbox_inches='tight')

    # 2. Projected network, whose edges carry both dist and weight.
    B = projection(GG)
    edges_list = list(B.edges(data=True))
    pos = nx.random_layout(B)
    plt.figure()
    nx.draw(B, pos=pos, width=0.01)
    statement = "plot/pro/projected graph " +  time_length + " for the " + str(index) + " group.png"
    plt.savefig(statement, dpi=300, bbox_inches='tight')

    # 3. Heat map of the projected weights. The matrix is built from the
    # movies of the selected group (ddf), not from a frame defined outside
    # this function.
    movie_ids = ddf["Movie_Id"].unique()
    df_new = pd.DataFrame(data=0, columns=movie_ids, index=movie_ids)

    for u, v, attrs in edges_list:
        # .at writes straight into df_new; chained indexing such as
        # df_new[u][v] = ... may assign to a temporary copy instead.
        df_new.at[v, u] = attrs["weight"]
        df_new.at[u, v] = attrs["weight"]

    plt.figure()
    fig = sns.heatmap(data=df_new,square=True, cmap="RdBu_r", vmin=0, vmax=20000)
    heat_fig = fig.get_figure()
    statement = "plot/heat/heat map of the degrees " +  time_length + " for the " + str(index) + " group.png"
    heat_fig.savefig(statement, dpi=300, bbox_inches='tight')

    return B