In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import gc

In [None]:
# Force a garbage-collection pass before the data files are loaded below.
gc.collect()

In [None]:
# Load every raw CSV used by this notebook from ../data (paths are relative
# to the notebook's working directory).
print("Reading the train and test files")
train_prod = pd.read_csv("../data/train.csv")
test_prod = pd.read_csv("../data/test.csv")

# Pair-level interaction tables, keyed by a "from-to" string.
print("Reading the interaction files")
interaction_swipe = pd.read_csv('../data/interaction_swipes.csv')
interaction_review_strength = pd.read_csv("../data/interaction_review_strengths.csv")
interaction_review_comments = pd.read_csv("../data/interaction_review_comments_300dims.csv")

# Per-user attribute tables, later merged on `user_id`.
print("Reading the User files")
user_strengths = pd.read_csv("../data/user_strengths.csv")
user_ages = pd.read_csv("../data/user_ages.csv")
user_educations = pd.read_csv("../data/user_educations.csv")
user_purpose = pd.read_csv("../data/user_purposes.csv")
user_self_intro = pd.read_csv("../data/user_self_intro_vectors_300dims.csv")
user_sessions = pd.read_csv("../data/user_sessions.csv")
user_skills = pd.read_csv("../data/user_skills.csv")
user_works = pd.read_csv("../data/user_works.csv")

print("All files read")
print("")
# Quick sanity check of the train / test sizes.
print(train_prod.shape, test_prod.shape)

In [None]:
# NOTE(review): this cell re-reads train.csv, test.csv and
# interaction_swipes.csv, the same three files already loaded by the cell
# above. Running it resets those three frames to their raw contents; on a
# straight top-to-bottom run it is redundant I/O.
print("Reading the train and test files")
train_prod = pd.read_csv("../data/train.csv")
test_prod = pd.read_csv("../data/test.csv")

print("Reading the interaction files")
interaction_swipe = pd.read_csv('../data/interaction_swipes.csv')

# Splitting the from-to in Train and Test prod 

In [None]:
print("Splitting the to and from in the train and test dataset")

# "from-to" holds two user ids joined by "-"; expand them into two columns.
train_prod[['from', 'to']] = train_prod['from-to'].str.split("-", expand=True)
test_prod[['from', 'to']] = test_prod['from-to'].str.split("-", expand=True)

# Train keeps only the key, the two ids and the target, with integer ids.
train_prod = train_prod[['from-to', 'from', 'to', 'score']].astype({'to': 'int', 'from': 'int'})
test_prod = test_prod.astype({'to': 'int', 'from': 'int'})

print("")

# Interaction data

### Interaction swipe

In [None]:
print("Removing the duplicates in the interaction swipe and converting the date to timestamp format")

# Shapes are printed before and after so the number of dropped rows is visible.
print(interaction_swipe.shape)
interaction_swipe = interaction_swipe.drop_duplicates()
print(interaction_swipe.shape)

# Unparseable timestamps become NaT (errors='coerce') instead of raising.
interaction_swipe['timestamp'] = pd.to_datetime(interaction_swipe['timestamp'].astype('str'), errors='coerce')

In [None]:
# Expand the "from-to" key of the swipe table into integer `from` / `to` ids.
print("Splitting the to and from in in the interaction swipe dataset")

interaction_swipe[['from', 'to']] = interaction_swipe['from-to'].str.split("-", expand=True)

# Keep a fixed column order and cast both ids to int in one step.
swipe_columns = ['from-to', 'from', 'to', 'timestamp', 'swipe_status']
interaction_swipe = interaction_swipe[swipe_columns].astype({'to': 'int', 'from': 'int'})
interaction_swipe

In [None]:
print("Getting the date features the interaction swipe dataset")

# Calendar parts are derived from `timestamp`; `weekends` is 1 for Saturday
# and Sunday (weekday 5 or 6) and 0 otherwise. Rows holding any missing value
# (including timestamps that failed to parse) are dropped at the end.
interaction_swipe = interaction_swipe.assign(
    hour=lambda swipes: swipes.timestamp.dt.hour,
    year=lambda swipes: swipes.timestamp.dt.year,
    month=lambda swipes: swipes.timestamp.dt.month,
    day=lambda swipes: swipes.timestamp.dt.day,
    weekday=lambda swipes: swipes.timestamp.dt.weekday,
    weekends=lambda swipes: np.where(swipes.weekday >= 5, 1, 0),
).dropna()
interaction_swipe

In [None]:
print("Getting the First and last swipe of the from and to user in the interaction swipe dataset")

def first_last_swipe_year(df, side):
    """Return the first and last swipe year, and their difference, per user.

    Parameters
    ----------
    df : DataFrame with a `year` column and the column named by `side`.
    side : str, name of the user column to group on (e.g. 'from' or 'to').

    Returns
    -------
    DataFrame with columns `side`, `<side>_first_swipe_year`,
    `<side>_last_swipe_year` and `<side>_swipe_year_difference`.
    """
    first_col = side + '_first_swipe_year'
    last_col = side + '_last_swipe_year'

    years = df.groupby(side)['year']
    summary = pd.DataFrame({first_col: years.min(), last_col: years.max()})
    summary[side + '_swipe_year_difference'] = summary[last_col] - summary[first_col]
    return summary.reset_index()

# Swipe-year span for each side of the interaction; each call gets its own copy.
from_first_last_swipe_year = first_last_swipe_year(interaction_swipe.copy(), 'from')
to_first_last_swipe_year = first_last_swipe_year(interaction_swipe.copy(), 'to')

from_first_last_swipe_year

In [None]:
print("Getting the total swipe days of the from and to user in the interaction swipe dataset")

def total_swipe_days(df, side):
    """Count the distinct calendar days on which each user has a swipe.

    Parameters
    ----------
    df : DataFrame with `year`, `month`, `day` and the column named by `side`.
    side : str, name of the user column to group on.

    Returns
    -------
    DataFrame with columns `side` and `<side>_total_swipe_days`.
    """
    # One row per (user, year, month, day) that occurs at least once.
    active_days = (df.groupby([side, 'year', 'month', 'day'])
                     .size()
                     .reset_index(name='swipes_that_day'))
    # The number of such rows per user is the number of active days.
    return active_days.groupby(side).size().reset_index(name=side + '_total_swipe_days')

# Active-day counts for the swiping user and for the swiped user.
from_total_swipe_days = total_swipe_days(interaction_swipe.copy(), 'from')
to_total_swipe_days = total_swipe_days(interaction_swipe.copy(), 'to')

from_total_swipe_days

In [None]:
print("Getting the weekday, weekend interaction of the from and to user in the interaction swipe dataset")

def weekend_interaction(df, side):
    '''
    Count each user's swipe rows on weekdays and on weekends, with shares.

    Parameters
    ----------
    df : DataFrame with a `weekends` column (0 = weekday, 1 = weekend) and
        the column named by `side`.
    side : str, name of the user column to group on ('from' or 'to').

    Returns
    -------
    DataFrame with columns `side`, `<side>_weekday_interaction_count`,
    `<side>_weekend_interaction_count`, `<side>_weekday_interaction_percentage`
    and `<side>_weekend_interaction_percentage` (the last two are fractions
    of the user's total, not values scaled to 100).
    '''
    weekday_col = side + '_weekday_interaction_count'
    weekend_col = side + '_weekend_interaction_count'

    # Reindex to both flag values so the output always has two count columns,
    # even when the data contains only weekday rows or only weekend rows.
    counts = (df.groupby([side, 'weekends'])
                .size()
                .unstack('weekends', fill_value=0)
                .reindex(columns=[0, 1], fill_value=0)
                .reset_index())
    counts.columns = [side, weekday_col, weekend_col]

    total = counts[weekday_col] + counts[weekend_col]
    counts[side + '_weekday_interaction_percentage'] = counts[weekday_col] / total
    counts[side + '_weekend_interaction_percentage'] = counts[weekend_col] / total

    return counts
    
# Weekday / weekend swipe counts for each side of the interaction.
from_weekend_interaction = weekend_interaction(interaction_swipe.copy(), 'from')
to_weekend_interaction = weekend_interaction(interaction_swipe.copy(), 'to')
to_weekend_interaction

In [None]:
print("Getting the dayswise interaction of the from and to user in the interaction swipe dataset")

def day_wise_count(df, side):
    """Count each user's swipe rows per weekday, with per-weekday shares.

    Parameters
    ----------
    df : DataFrame with a `weekday` column (0-6, as produced by
        `Series.dt.weekday`) and the column named by `side`.
    side : str, name of the user column to group on ('from' or 'to').

    Returns
    -------
    DataFrame with `side`, seven `<side>_weekday<N>_interaction_count`
    columns, then seven `<side>_weekday<N>_interaction_percentage` columns
    (fractions of the user's total).
    """
    weekdays = list(range(7))
    count_cols = [side + '_weekday' + str(day) + '_interaction_count' for day in weekdays]
    share_cols = [side + '_weekday' + str(day) + '_interaction_percentage' for day in weekdays]

    # Reindex to all seven weekdays so a weekday that never occurs in the data
    # yields a zero column instead of breaking the column renaming.
    counts = (df.groupby([side, 'weekday'])
                .size()
                .unstack('weekday', fill_value=0)
                .reindex(columns=weekdays, fill_value=0))
    counts.columns = count_cols

    total = counts.sum(axis=1)
    for count_col, share_col in zip(count_cols, share_cols):
        counts[share_col] = counts[count_col] / total

    return counts.reset_index()

# Per-weekday swipe counts for each side of the interaction.
from_daywise_interaction = day_wise_count(interaction_swipe.copy(), 'from')
to_daywise_interaction = day_wise_count(interaction_swipe.copy(), 'to')
from_daywise_interaction

In [None]:
print("Getting the swipe left, right count, percentage and total swipe count of the from and to user in the interaction swipe dataset")

def swipe_counts(df, side):
    """Count swipes per user and per `swipe_status`, with totals and shares.

    The two pivoted status columns are named "left" then "right" in sorted
    status order, so exactly two distinct `swipe_status` values must be
    present in `df` (otherwise the column renaming raises).

    Parameters
    ----------
    df : DataFrame with `swipe_status` and the column named by `side`.
    side : str, name of the user column to group on.

    Returns
    -------
    DataFrame with `side`, `<side>_swipe_left_count`,
    `<side>_swipe_right_count`, `<side>_total_swipe_counts`,
    `<side>_left_percentage`, `<side>_right_percentage`; missing values are 0.
    """
    left_col = side + '_swipe_left_count'
    right_col = side + '_swipe_right_count'
    total_col = side + '_total_swipe_counts'

    per_status = (df.groupby([side, 'swipe_status'])
                    .size()
                    .unstack('swipe_status')
                    .reset_index())
    per_status.columns = [side, left_col, right_col]

    per_status[total_col] = per_status[[left_col, right_col]].sum(axis=1)
    per_status[side + '_left_percentage'] = per_status[left_col] / per_status[total_col]
    per_status[side + '_right_percentage'] = per_status[right_col] / per_status[total_col]

    # A user with no swipes of one status has NaN there; report it as 0.
    return per_status.fillna(0)

# Swipe-status counts for the swiping user and for the swiped user.
interaction_from_swipe_status_counts = swipe_counts(interaction_swipe, 'from')
interaction_to_swipe_status_counts = swipe_counts(interaction_swipe, 'to')

print(interaction_from_swipe_status_counts.shape, interaction_to_swipe_status_counts.shape)
interaction_from_swipe_status_counts

In [None]:
print("Getting the time stats of swipes of the from and to user in the interaction swipe dataset")

def time_since_swipe_stat(df, side):
    """Summarise the gaps, in seconds, between a user's consecutive swipes.

    Rows are ordered by user and timestamp, and each swipe is compared with
    the same user's previous one. A user's first swipe has no predecessor
    and contributes a gap of 0.

    Parameters
    ----------
    df : DataFrame with a datetime `timestamp` column and the column named
        by `side`.
    side : str, name of the user column to group on ('from' or 'to').

    Returns
    -------
    DataFrame with `side`, `<side>_mean_time_since_last_swipe`,
    `<side>_median_time_since_last_swipe`, `<side>_max_time_since_last_swipe`.
    """
    df = df.sort_values([side, 'timestamp'])
    previous_swipe = df.groupby(side)['timestamp'].shift(1)

    # total_seconds() gives the full elapsed time. `.dt.seconds` is only the
    # within-day component, so a gap of exactly two days would read as 0.
    df['time_since_last_swipe'] = (df['timestamp'] - previous_swipe).dt.total_seconds().fillna(0)

    stats = (df.groupby(side)['time_since_last_swipe']
               .agg(['mean', 'median', 'max'])
               .reset_index())
    stats.columns = [side,
                     side + '_mean_time_since_last_swipe',
                     side + '_median_time_since_last_swipe',
                     side + '_max_time_since_last_swipe']
    return stats

# Gap statistics between consecutive swipes, for each side of the interaction.
from_swipe_time_stat = time_since_swipe_stat(interaction_swipe.copy(), 'from')
to_swipe_time_stat = time_since_swipe_stat(interaction_swipe.copy(), 'to')

print(from_swipe_time_stat.shape, to_swipe_time_stat.shape)
from_swipe_time_stat

In [None]:
print("Getting the time since last left or right swipes of the from and to user in the interaction swipe dataset")

def time_since_left_right_swipe_stat(df, side):
    '''
    Summarise the gaps, in seconds, between a user's consecutive swipes of
    the same `swipe_status`.

    A user's first swipe of a given status has no predecessor and
    contributes a gap of 0. The pivoted columns are named "left" then
    "right" in sorted status order, so exactly two distinct `swipe_status`
    values must be present in `df` (otherwise the column renaming raises).

    Parameters
    ----------
    df : DataFrame with a datetime `timestamp`, `swipe_status` and the
        column named by `side`.
    side : str, name of the user column to group on ('from' or 'to').

    Returns
    -------
    DataFrame with `side` and, for mean / median / max in that order, a
    `<side>_swipe_left_<stat>_time` and `<side>_swipe_right_<stat>_time`
    column. A user with no swipes of one status has NaN in its columns.
    '''
    df = df.sort_values([side, 'swipe_status', 'timestamp'])

    previous_swipe = df.groupby([side, 'swipe_status'])['timestamp'].shift(1)
    # total_seconds() gives the full elapsed time. `.dt.seconds` is only the
    # within-day component, so gaps of a day or more would wrap around.
    df['time_since_last_swipe'] = (df['timestamp'] - previous_swipe).dt.total_seconds().fillna(0)

    stats = df.groupby([side, 'swipe_status']).agg(
        mean_time_since_last_swipe=('time_since_last_swipe', 'mean'),
        median_time_since_last_swipe=('time_since_last_swipe', 'median'),
        max_time_since_last_swipe=('time_since_last_swipe', 'max')).reset_index()

    # Wide layout: one (stat, status) column per combination, grouped by stat.
    wide = stats.pivot(index=side, columns='swipe_status',
                       values=['mean_time_since_last_swipe',
                               'median_time_since_last_swipe',
                               'max_time_since_last_swipe']).reset_index()

    wide.columns = [side, side + '_swipe_left_mean_time', side + '_swipe_right_mean_time',
                    side + '_swipe_left_median_time', side + '_swipe_right_median_time',
                    side + '_swipe_left_max_time', side + '_swipe_right_max_time']

    return wide

# Per-status gap statistics, for each side of the interaction.
from_swipe_time_since_left_right = time_since_left_right_swipe_stat(interaction_swipe.copy(), 'from')
to_swipe_time_since_left_right = time_since_left_right_swipe_stat(interaction_swipe.copy(), 'to')

print(from_swipe_time_since_left_right.shape, to_swipe_time_since_left_right.shape)
from_swipe_time_since_left_right

In [None]:
print("Getting the users age in the app from interaction swipe dataset")

def user_swipe_age_in_app(df, side):
    """Return the seconds between each user's first and last swipe.

    Parameters
    ----------
    df : DataFrame with a datetime `timestamp` column and the column named
        by `side`.
    side : str, name of the user column to group on ('from' or 'to').

    Returns
    -------
    DataFrame with columns `side` and `<side>_swipe_age_in_app` (float
    seconds; 0 for a user with a single swipe).
    """
    span = df.groupby(side)['timestamp'].agg(['min', 'max'])
    # total_seconds() covers the whole span. `.dt.seconds` drops whole days,
    # so a user active for many days would appear less than a day old.
    span[side + '_swipe_age_in_app'] = (span['max'] - span['min']).dt.total_seconds()
    return span[[side + '_swipe_age_in_app']].reset_index()

# First-to-last swipe span for each side of the interaction.
from_user_age_in_app = user_swipe_age_in_app(interaction_swipe.copy(), 'from')
to_user_age_in_app = user_swipe_age_in_app(interaction_swipe.copy(), 'to')

to_user_age_in_app

In [None]:
# Merging all the swipe interaction from and to stats.
# Each side starts from its gap statistics and inner-joins the remaining
# per-user feature frames one by one; the shape is printed after every join.

to_stats = to_swipe_time_stat
for _to_features in (to_swipe_time_since_left_right,
                     interaction_to_swipe_status_counts,
                     to_user_age_in_app,
                     to_weekend_interaction,
                     to_daywise_interaction,
                     to_total_swipe_days,
                     to_first_last_swipe_year):
    to_stats = to_stats.merge(_to_features, left_on='to', right_on='to', how='inner')
    print(to_stats.shape)
print("")

from_stats = from_swipe_time_stat
for _from_features in (from_swipe_time_since_left_right,
                       interaction_from_swipe_status_counts,
                       from_user_age_in_app,
                       from_weekend_interaction,
                       from_daywise_interaction,
                       from_total_swipe_days,
                       from_first_last_swipe_year):
    from_stats = from_stats.merge(_from_features, left_on='from', right_on='from', how='inner')
    print(from_stats.shape)


### Interaction review_strengths

In [None]:
# Display the raw review-strength table before it is reshaped below.
interaction_review_strength

In [None]:
def total_interaction_review_strength_preprocess(df):
    """One-hot encode review strengths and count them per `from-to` pair.

    Parameters
    ----------
    df : DataFrame with `from-to` and `strength_id` columns, one row per
        strength reported for a pair.

    Returns
    -------
    DataFrame with `from-to`, one integer `rev_strength_<id>` count column
    per distinct strength id, and
    `total_interaction_review_strength_assessed` (row sum of those counts).
    """
    # dtype=int keeps the indicators numeric: newer pandas returns booleans
    # from get_dummies by default.
    indicators = pd.get_dummies(df.set_index('from-to')['strength_id'], dtype=int).add_prefix('rev_strength_')

    # groupby(level=0).sum() replaces .sum(level=0), which pandas 2.0 removed.
    df = indicators.groupby(level=0).sum().reset_index()
    df['total_interaction_review_strength_assessed'] = df.iloc[:, 1:].sum(axis=1)

    return df

# NOTE: this rebinds `interaction_review_strength` to the processed frame,
# which no longer has a `strength_id` column, so re-running this cell without
# reloading the CSV fails.
interaction_review_strength = total_interaction_review_strength_preprocess(df=interaction_review_strength)

In [None]:
# Display the per-pair review-strength counts.
interaction_review_strength

### Interaction review comments

In [None]:
# Display the review-comment table as loaded.
interaction_review_comments

# User data

### User age

In [None]:
# Clamp ages into the [18, 55] range. Only the `age` column is modified:
# assigning through a row mask (`user_ages[mask] = 18`) overwrites every
# column of the matching rows, including `user_id`, which breaks the later
# merges on `user_id`. Missing ages stay missing.
user_ages['age'] = user_ages['age'].clip(lower=18, upper=55)
user_ages

### User educations

In [None]:
# Display the raw education table before preprocessing.
user_educations

In [None]:
def user_education_preprocess(df):
    """Deduplicate education rows and fill missing degree ids.

    Rows are sorted by `user_id` then `degree_id`, so a user's missing
    degree ids come after the known ones. Each row then gets a
    `final_degree_id`:

    * a known `degree_id` is kept as is;
    * a missing one that follows a known degree for the same user becomes
      that last known degree + 1;
    * if the user has no known degree at all, the row's 1-based position
      within the user's rows is used.

    Parameters
    ----------
    df : DataFrame with at least `user_id` and `degree_id` columns.

    Returns
    -------
    A new DataFrame without `degree_id` and with `final_degree_id` appended.
    The input frame is not modified.
    """
    print("Before dropping duplicates", df.shape)
    df = df.drop_duplicates()
    df = df.sort_values(['user_id', 'degree_id']).reset_index(drop=True)
    print("After dropping duplicates", df.shape)

    # Forward-fill within each user, then bump only the rows that were filled.
    degree_missing = df['degree_id'].isnull()
    filled_degree = df.groupby('user_id')['degree_id'].ffill()
    filled_degree = filled_degree.where(~degree_missing, filled_degree + 1)

    # cumcount() is index-aligned on every pandas version, unlike the result
    # of groupby(...).apply(...), whose index layout varies between versions.
    position_in_user = df.groupby('user_id').cumcount() + 1

    df['final_degree_id'] = np.where(filled_degree.isnull(), position_in_user, filled_degree)

    return df.drop(columns=['degree_id'])

# NOTE: `user_educations` is rebound to the processed frame, which no longer
# has a `degree_id` column, so re-running this cell without reloading the CSV
# fails inside the preprocessing step.
user_educations = user_education_preprocess(user_educations)
user_educations.head(20)

# One row per user: number of education records, distinct schools and
# distinct (filled) degree ids.
user_educations_v2 = (
    user_educations
    .groupby('user_id')
    .agg(education_count=('user_id', 'count'),
         unique_school_count=('school_id', 'nunique'),
         unique_degree_count=('final_degree_id', 'nunique'))
    .reset_index()
)

user_educations_v2

### User purpose

In [None]:
def user_purpose_preprocess(df):
    """Binarise the purpose flags and add a per-user purpose total.

    The purpose flags are taken to be the 15 columns at positions 1..15,
    right after the id column. Values 2-6 in them are replaced by 1, and
    `total_purpose` is the row sum of those same columns.

    Parameters
    ----------
    df : DataFrame whose columns 1..15 are the purpose flags. It is modified
        in place, so pass a copy to keep the original.

    Returns
    -------
    The same DataFrame with the flags capped and `total_purpose` appended.
    """
    # One slice for the read, the write and the sum. The earlier `1:-1`
    # target only matched the `1:16` source when the frame had exactly 17
    # columns.
    purpose_cols = slice(1, 16)

    # From our EDA we saw there were values > 1. So let's fill all of them as 1 for now
    df.iloc[:, purpose_cols] = df.iloc[:, purpose_cols].replace([2, 3, 4, 5, 6], 1)
    df['total_purpose'] = df.iloc[:, purpose_cols].sum(axis=1)
    return df

# Work on a copy so the raw `user_purpose` frame stays untouched.
user_purpose_v2 = user_purpose_preprocess(user_purpose.copy())
user_purpose_v2

### User self intro

In [None]:
# Display the self-introduction vector table as loaded.
user_self_intro

### User session

In [None]:
# Parse the session timestamps (unparseable values raise here).
user_sessions['timestamp'] = pd.to_datetime(user_sessions['timestamp'])

# Calendar parts derived from `timestamp`; `weekends` is 1 for Saturday and
# Sunday (weekday 5 or 6). Rows holding any missing value are dropped.
user_sessions = user_sessions.assign(
    hour=lambda sessions: sessions.timestamp.dt.hour,
    year=lambda sessions: sessions.timestamp.dt.year,
    month=lambda sessions: sessions.timestamp.dt.month,
    day=lambda sessions: sessions.timestamp.dt.day,
    weekday=lambda sessions: sessions.timestamp.dt.weekday,
    weekends=lambda sessions: np.where(sessions.weekday >= 5, 1, 0),
).dropna()

user_sessions

In [None]:
print("Getting the users session age in the app from session dataset")

def compute_user_session_age_in_app(df):
    """Return the seconds between each user's first and last session.

    Parameters
    ----------
    df : DataFrame with `user_id` and a datetime `timestamp` column.

    Returns
    -------
    DataFrame with columns `user_id` and `session_age_in_app` (float
    seconds; 0 for a user with a single session).
    """
    span = df.groupby('user_id')['timestamp'].agg(['min', 'max'])
    # total_seconds() covers the whole span; `.dt.seconds` drops whole days.
    span['session_age_in_app'] = (span['max'] - span['min']).dt.total_seconds()
    return span[['session_age_in_app']].reset_index()

# The function has its own name so that the result variable does not replace
# it; otherwise re-running this cell would try to call a DataFrame.
user_session_age_in_app = compute_user_session_age_in_app(df=user_sessions.copy())
user_session_age_in_app

In [None]:
print("Getting the total session count of each user from the session dataset")

def get_session_counts(df):
    """Count how many session rows each user has.

    Parameters
    ----------
    df : DataFrame with a `user_id` column, one row per session.

    Returns
    -------
    DataFrame with columns `user_id` and `total_session_count`.
    """
    return df.groupby('user_id').size().reset_index(name='total_session_count')

# Session count per user, computed on a copy of the session table.
user_session_counts = get_session_counts(user_sessions.copy())

user_session_counts

In [None]:
print("Getting the first and last session year of each user from the session dataset")

def first_last_session_year(df):
    """Return the first and last session year, and their difference, per user.

    Parameters
    ----------
    df : DataFrame with `user_id` and `year` columns.

    Returns
    -------
    DataFrame with columns `user_id`, `first_session_year`,
    `last_session_year` and `session_year_difference`.
    """
    years = (df.groupby('user_id')['year']
               .agg(first_session_year='min', last_session_year='max')
               .reset_index())
    years['session_year_difference'] = years['last_session_year'] - years['first_session_year']
    return years

# Session-year span per user, computed on a copy of the session table.
user_first_last_session_year = first_last_session_year(user_sessions.copy())

user_first_last_session_year

In [None]:
print("Getting the total session days of each user from the session dataset")

def total_session_days(df):
    """Count the distinct calendar days on which each user has a session.

    Parameters
    ----------
    df : DataFrame with `user_id`, `year`, `month` and `day` columns.

    Returns
    -------
    DataFrame with columns `user_id` and `total_session_days`.
    """
    # Distinct days within each (user, year, month), then summed per user.
    days_per_month = df.groupby(['user_id', 'year', 'month'])['day'].nunique()
    return (days_per_month.groupby(level='user_id')
                          .sum()
                          .reset_index(name='total_session_days'))

# Active session days per user, computed on a copy of the session table.
user_total_session_days = total_session_days(user_sessions.copy())

user_total_session_days

In [None]:
def session_time_stats(df):
    """Summarise the gaps, in seconds, between a user's consecutive sessions.

    Rows are ordered by user and timestamp, and each session is compared
    with the same user's previous one. A user's first session has no
    predecessor and contributes a gap of 0.

    Parameters
    ----------
    df : DataFrame with `user_id` and a datetime `timestamp` column.

    Returns
    -------
    DataFrame with columns `user_id`, `mean_session_time`,
    `median_session_time` and `max_session_time`.
    """
    df = df.sort_values(['user_id', 'timestamp'])
    previous_session = df.groupby('user_id')['timestamp'].shift(1)

    # total_seconds() gives the full elapsed time. `.dt.seconds` is only the
    # within-day component, so gaps of a day or more would wrap around.
    df['time_since_last_session'] = (df['timestamp'] - previous_session).dt.total_seconds().fillna(0)

    return df.groupby('user_id').agg(
        mean_session_time=('time_since_last_session', 'mean'),
        median_session_time=('time_since_last_session', 'median'),
        max_session_time=('time_since_last_session', 'max')).reset_index()

# Gap statistics between sessions, computed on a copy of the session table.
user_session_stats = session_time_stats(user_sessions.copy())
user_session_stats

### User skills

In [None]:
def user_skills_preprocess(df):
    """Count each user's skills after removing exact duplicate rows.

    Prints the frame shape before and after deduplication.

    Parameters
    ----------
    df : DataFrame with `user_id` and `skill_id` columns.

    Returns
    -------
    DataFrame with columns `user_id` and `total_skills` (number of non-null
    `skill_id` values per user).
    """
    print(df.shape)
    unique_rows = df.drop_duplicates()
    print(unique_rows.shape)

    return unique_rows.groupby('user_id')['skill_id'].count().reset_index(name='total_skills')
    
# Skill count per user.
user_skills_v2 = user_skills_preprocess(user_skills)
user_skills_v2

### User strength

In [None]:
def user_strength_preprocess(df):
    """Append `total_strength_votes`, the row sum of every column after the first.

    Parameters
    ----------
    df : DataFrame whose first column is skipped and whose remaining columns
        are summed. It is modified in place.

    Returns
    -------
    The same DataFrame object with the new column.
    """
    vote_columns = df.iloc[:, 1:]
    df['total_strength_votes'] = vote_columns.sum(axis=1)
    return df

# Work on a copy so the raw `user_strengths` frame stays untouched.
user_strengths_v2 = user_strength_preprocess(user_strengths.copy())
user_strengths_v2

### User works

In [None]:
def user_works_preprocess(df):
    """Clean the work-history rows and aggregate them per user.

    Steps: sort, drop exact duplicate rows, then drop any row that repeats
    an earlier (user, company, industry) combination and has no
    `over_1000_employees` value. Shapes are printed before and after.

    Parameters
    ----------
    df : DataFrame with `user_id`, `company_id`, `industry_id` and
        `over_1000_employees` columns.

    Returns
    -------
    DataFrame with `user_id`, `uniq_companies_worked`,
    `uniq_industries_worked` and `total_compaies_employ_gr_1000` (sum of
    `over_1000_employees`).
    """
    print(df.shape)
    ordered = (df.sort_values(['user_id', 'company_id', 'industry_id', 'over_1000_employees'])
                 .drop_duplicates()
                 .reset_index(drop=True))

    # The sort puts missing `over_1000_employees` last within a job, so a
    # repeated job with a missing flag comes after its informative twin.
    repeated_job = ordered.duplicated(subset=['user_id', 'company_id', 'industry_id'])
    uninformative = repeated_job & ordered['over_1000_employees'].isnull()

    kept = ordered.loc[~uninformative].reset_index(drop=True)
    print(kept.shape)

    return kept.groupby('user_id').agg(
        uniq_companies_worked=('company_id', 'nunique'),
        uniq_industries_worked=('industry_id', 'nunique'),
        total_compaies_employ_gr_1000=('over_1000_employees', 'sum')).reset_index()

# Work on a copy so the raw `user_works` frame stays untouched.
user_works_v2 = user_works_preprocess(user_works.copy())
user_works_v2.tail(15)

In [None]:
# Shapes of the per-user tables that are merged into the user master frame.
print("Age:", user_ages.shape, 
      "Education:",  user_educations_v2.shape,
      "Purpose:", user_purpose_v2.shape, 
      "User skills:", user_skills_v2.shape, 
      "Strength:",user_strengths_v2.shape, 
      "Works:", user_works_v2.shape, #user_self_intro.shape, user_sessions.shape
     )

# Merging all the users data

In [None]:
# Build one wide per-user frame by outer-joining every user table on
# `user_id`; the shape is printed after each join.
print("Merging the age and education")
user_master_df = pd.merge(user_ages, user_educations_v2, on=['user_id'], how='outer')
print(user_master_df.shape)

# (progress message, frame) pairs, joined in this order.
_user_merge_steps = [
    ("Merging the purpose", user_purpose_v2),
    ("Merging the User Session age", user_session_age_in_app),
    ("Merging the Session count", user_session_counts),
    ("Merging the Session first last year", user_first_last_session_year),
    ("Merging the Session unique days", user_total_session_days),
    ("Merging the Session time stats", user_session_stats),
    ("Merging the skills", user_skills_v2),
    ("Merging the strength", user_strengths_v2),
    ("Merging the user work", user_works_v2),
    ("Merging the user intro", user_self_intro),
]

for _step_message, _step_frame in _user_merge_steps:
    print("")
    print(_step_message)
    user_master_df = pd.merge(user_master_df, _step_frame, on=['user_id'], how='outer')
    print(user_master_df.shape)


# Merging the user intro
# (46774, 345)

# Merging all the data to train and test prod 

In [None]:
# Combining the Interaction swipe stats with train and test.
# Every merge in this cell is a left join, so train/test keep all their rows.

print(train_prod.shape, test_prod.shape)

train_prod = train_prod.merge(to_stats, left_on='to', right_on='to', how='left')
train_prod = train_prod.merge(from_stats, left_on='from', right_on='from', how='left')

test_prod = test_prod.merge(to_stats, left_on='to', right_on='to', how='left')
test_prod = test_prod.merge(from_stats, left_on='from', right_on='from', how='left')
print(train_prod.shape, test_prod.shape)

# Combining all interaction swipe count between the same from-to user to the train and test set
# NOTE(review): `interaction_from_to_swipe_counts` is not created by any cell
# above in this notebook, so on a fresh kernel the next line raises
# NameError. TODO: restore the cell that builds it (it must carry `from` and
# `to` columns).
train_prod = train_prod.merge(interaction_from_to_swipe_counts, left_on = ['from', 'to'], right_on = ['from', 'to'], how='left')
test_prod = test_prod.merge(interaction_from_to_swipe_counts, left_on = ['from', 'to'], right_on = ['from', 'to'], how='left')

# Combining the interaction_review_strength
train_prod = train_prod.merge(interaction_review_strength, left_on = 'from-to', right_on='from-to', how='left')
test_prod = test_prod.merge(interaction_review_strength, left_on = 'from-to', right_on='from-to', how='left')
print(train_prod.shape, test_prod.shape)

# Combining the interaction review comments
train_prod = train_prod.merge(interaction_review_comments, left_on = 'from-to', right_on='from-to', how='left')
test_prod = test_prod.merge(interaction_review_comments, left_on = 'from-to', right_on='from-to', how='left')

In [None]:
# Two prefixed copies of the user master frame, one per side of the pair, so
# their columns stay distinct after both are merged onto train/test.
from_user_data = user_master_df.copy().add_prefix("from_")
to_user_data = user_master_df.copy().add_prefix("to_")

In [None]:
# Attach the user features of both sides to every train pair (left joins).
train_prod = pd.merge(train_prod, from_user_data, left_on='from', right_on='from_user_id', how='left')
print(train_prod.shape)
train_prod = pd.merge(train_prod, to_user_data, left_on='to', right_on='to_user_id', how='left')
print(train_prod.shape)

In [None]:
# Attach the user features of both sides to every test pair (left joins).
test_prod = pd.merge(test_prod, from_user_data, left_on='from', right_on='from_user_id', how='left')
print(test_prod.shape)
test_prod = pd.merge(test_prod, to_user_data, left_on='to', right_on='to_user_id', how='left')
print(test_prod.shape)

In [None]:
# The prefixed id columns duplicate `from` / `to` after the merges; drop them.
train_prod = train_prod.drop(columns=['from_user_id', 'to_user_id'])
print(train_prod.shape)

test_prod = test_prod.drop(columns=['from_user_id', 'to_user_id'])
print(test_prod.shape)


In [None]:
def get_cosine_similarity(user_df, features_df, selected_columns):
    """Cosine similarity between the feature rows of each `from` / `to` pair.

    For every row of `user_df`, the rows of `features_df` whose `user_id`
    equals the pair's `from` or `to` are selected, and entry [0, 1] of their
    pairwise cosine-similarity matrix is recorded. If that raises IndexError
    or ValueError (for example when fewer than two rows are found), the
    score is recorded as 0.

    Parameters
    ----------
    user_df : DataFrame with `from-to`, `from` and `to` columns.
    features_df : DataFrame with `user_id` and the feature columns.
    selected_columns : columns of `features_df` used as the feature vector.

    Returns
    -------
    DataFrame with columns `from-to` and `cosine_similarity`, one row per
    row of `user_df`, in the same order.
    """
    pair_keys = []
    scores = []

    pairs = user_df[['from-to', 'from', 'to']]
    for _, pair in tqdm(pairs.iterrows(), total=len(user_df)):
        try:
            is_pair_member = features_df.user_id.isin([pair['from'], pair['to']])
            vectors = features_df.loc[is_pair_member, selected_columns].copy()
            score = cosine_similarity(vectors)[0, 1]
        except (IndexError, ValueError):
            # The pair could not be scored; fall back to 0.
            score = 0

        pair_keys.append(pair['from-to'])
        scores.append(score)

    return pd.DataFrame({'from-to': pair_keys,
                         'cosine_similarity': scores})


In [None]:
# Feature columns of the self-introduction vectors used for the similarity.
# NOTE(review): 'features_df' is the name of a get_cosine_similarity
# parameter, which makes it look like a placeholder rather than a real
# column-name pattern. If no column matches, every pair is probably scored 0
# through the function's error fallback. TODO: confirm against the CSV header.
selected_cols = user_self_intro.columns[user_self_intro.columns.str.contains('features_df')]

train_prod_user_intro_cosine_similarity = get_cosine_similarity(user_df=train_prod, features_df=user_self_intro, selected_columns=selected_cols)
print("")
test_prod_user_intro_cosine_similarity = get_cosine_similarity(user_df=test_prod, features_df=user_self_intro, selected_columns=selected_cols)

# Persist the scores so the row-by-row loop does not have to be repeated.
train_prod_user_intro_cosine_similarity.to_csv("../data/train_prod_user_intro_cosine_similarity.csv", index=False)
test_prod_user_intro_cosine_similarity.to_csv("../data/test_prod_user_intro_cosine_similarity.csv", index=False)

In [None]:
# Reload the self-intro similarity scores written by the previous cell.
train_prod_user_intro_cosine_similarity = pd.read_csv("../data/train_prod_user_intro_cosine_similarity.csv")
test_prod_user_intro_cosine_similarity = pd.read_csv("../data/test_prod_user_intro_cosine_similarity.csv")

In [None]:
# Left-join the self-intro similarity onto each pair and give the score
# column a feature-specific name.
intro_similarity_name = {'cosine_similarity': 'user_intro_cosine_similarity'}

train_prod = (train_prod
              .merge(train_prod_user_intro_cosine_similarity, on='from-to', how='left')
              .rename(columns=intro_similarity_name))
test_prod = (test_prod
             .merge(test_prod_user_intro_cosine_similarity, on='from-to', how='left')
             .rename(columns=intro_similarity_name))

print(train_prod.shape)
print(test_prod.shape)

In [None]:
# Purpose flag columns used as the feature vector for the similarity.
selected_cols = user_purpose.columns[user_purpose.columns.str.contains('purpose_id_')]

print("Running the train similarity")
train_prod_purpose_cosine_similarity = get_cosine_similarity(train_prod, user_purpose, selected_cols)
print("")
print("Running the test similarity")
test_prod_purpose_cosine_similarity = get_cosine_similarity(test_prod, user_purpose, selected_cols)

# Persist the scores so the row-by-row loop does not have to be repeated.
train_prod_purpose_cosine_similarity.to_csv("../data/train_prod_purpose_cosine_similarity.csv", index=False)
test_prod_purpose_cosine_similarity.to_csv("../data/test_prod_purpose_cosine_similarity.csv", index=False)

In [None]:
# Reload the purpose similarity scores written by the previous cell.
train_prod_purpose_cosine_similarity = pd.read_csv("../data/train_prod_purpose_cosine_similarity.csv")
test_prod_purpose_cosine_similarity = pd.read_csv("../data/test_prod_purpose_cosine_similarity.csv")

# Left-join them onto each pair and give the score column a
# feature-specific name.
purpose_similarity_name = {'cosine_similarity': 'user_purpose_cosine_similarity'}

train_prod = (train_prod
              .merge(train_prod_purpose_cosine_similarity, on='from-to', how='left')
              .rename(columns=purpose_similarity_name))
test_prod = (test_prod
             .merge(test_prod_purpose_cosine_similarity, on='from-to', how='left')
             .rename(columns=purpose_similarity_name))

print(train_prod.shape)
print(test_prod.shape)

In [None]:
# Split the "from-to" key of the review-comment table into integer
# `from` / `to` columns.
interaction_review_comments[['from', 'to']] = interaction_review_comments['from-to'].str.split("-", expand=True).astype('int')
# Names of the review-comment vector columns (those containing "review_comment_").
rev_columns = interaction_review_comments.columns[interaction_review_comments.columns.str.contains("review_comment_")].tolist()
# Display the key columns followed by the vector columns.
interaction_review_comments[['from-to', 'from', 'to'] + rev_columns]

In [None]:
def review_comment_count(df, side):
    """
    Count how many rows of ``df`` each user appears in on the given side.

    With ``side='from'`` this is the number of reviews a user provided; with
    ``side='to'`` it is the number of reviews a user received.

    Returns a DataFrame with one row per distinct ``side`` value and the
    columns ``[side, side + "_review_comments_count"]``.
    """
    count_column = side + "_review_comments_count"
    # Named aggregation lets the output column carry its final name directly.
    per_user = df.groupby([side]).agg(**{count_column: (side, 'count')})
    return per_user.reset_index()

# Per-user review counts for each side of the pair: keyed by `from` and by `to`.
from_review_count = review_comment_count(df=interaction_review_comments, side='from')
to_review_count = review_comment_count(df=interaction_review_comments, side='to')


In [None]:
print("Before merge", train_prod.shape, test_prod.shape)

# Attach the `from`-side count, then the `to`-side count, to every pair.
# Left joins keep all pairs; users without any review get NaN counts.
train_prod = (
    train_prod
    .merge(from_review_count, on='from', how='left')
    .merge(to_review_count, on='to', how='left')
)
test_prod = (
    test_prod
    .merge(from_review_count, on='from', how='left')
    .merge(to_review_count, on='to', how='left')
)

print("After merge", train_prod.shape, test_prod.shape)


In [None]:
def get_common_purpose_count(user_df, purpose_df, purpose_columns):
    """
    Count, for every (from, to) pair, the purposes both users agree on.

    The purpose rows of the two users are added column by column: a sum of 2
    counts as a common interested purpose and a sum of 0 as a common
    uninterested purpose (this presumes 0/1 indicator columns - verify
    against the purpose data).

    Parameters
    ----------
    user_df : DataFrame
        Must contain the columns 'from-to', 'from' and 'to'.
    purpose_df : DataFrame
        Must contain 'user_id' and every column in ``purpose_columns``. If a
        user has several rows only the first one is used.
    purpose_columns : list-like of str
        Purpose columns to compare.

    Returns
    -------
    DataFrame
        One row per row of ``user_df`` (same order) with the columns
        'from-to', 'common_interested_purpose' and
        'common_uninterested_purpose'. Both counts are -999 when either user
        has no row in ``purpose_df``.
    """
    purpose_columns = list(purpose_columns)

    # One lookup table indexed by user instead of re-scanning purpose_df twice
    # for every pair.
    purposes_by_user = (
        purpose_df
        .drop_duplicates(subset='user_id', keep='first')
        .set_index('user_id')[purpose_columns]
    )

    from_ids = user_df['from']
    to_ids = user_df['to']
    # Membership is tested on the ids rather than inferred from NaN values so
    # that genuine NaNs inside purpose_df are not mistaken for unknown users.
    both_known = (
        from_ids.isin(purposes_by_user.index) & to_ids.isin(purposes_by_user.index)
    ).to_numpy()

    purpose_sum = (
        purposes_by_user.reindex(from_ids.to_numpy()).to_numpy()
        + purposes_by_user.reindex(to_ids.to_numpy()).to_numpy()
    )
    interested_count = (purpose_sum == 2).sum(axis=1)
    uninterested_count = (purpose_sum == 0).sum(axis=1)

    return pd.DataFrame({
        'from-to': user_df['from-to'].to_numpy(),
        'common_interested_purpose': np.where(both_known, interested_count, -999),
        'common_uninterested_purpose': np.where(both_known, uninterested_count, -999),
    })


In [None]:
# Every user_purpose column whose name contains 'purpose'.
sel_columns = user_purpose.columns[user_purpose.columns.str.contains('purpose')]

# Common-purpose counts for train and test pairs; copies are passed so the
# function works on its own frames.
train_purpose_match_df = get_common_purpose_count(user_df=train_prod.copy(), purpose_df=user_purpose.copy(), purpose_columns=sel_columns)
test_purpose_match_df = get_common_purpose_count(user_df=test_prod.copy(), purpose_df=user_purpose.copy(), purpose_columns=sel_columns)

# Cache the results on disk; the next cell reloads them from these files.
train_purpose_match_df.to_csv("../data/train_purpose_match_df.csv", index=False)
test_purpose_match_df.to_csv("../data/test_purpose_match_df.csv", index=False)

In [None]:
# Reload the cached common-purpose counts.
train_purpose_match_df = pd.read_csv("../data/train_purpose_match_df.csv")
test_purpose_match_df = pd.read_csv("../data/test_purpose_match_df.csv")

# Left-join the counts onto the pair frames by the 'from-to' key.
train_prod = train_prod.merge(train_purpose_match_df, on='from-to', how='left')
test_prod = test_prod.merge(test_purpose_match_df, on='from-to', how='left')

In [None]:
def common_users_swiped(interaction, df):
    """
    Count the swipe targets that the two users of each pair have in common.

    Parameters
    ----------
    interaction : DataFrame
        Swipe log with the columns 'from', 'to' and 'swipe_status'; status 1
        is treated as a right swipe and -1 as a left swipe.
    df : DataFrame
        Pairs to score; must contain 'from-to', 'from' and 'to'.

    Returns
    -------
    DataFrame
        One row per row of ``df`` (same order) with 'from-to' and four counts
        of distinct targets:
        - common_users_swiped_right: swiped right by both users
        - common_users_swiped_left: swiped left by both users
        - common_users_from_left_to_right_swiped: swiped left by the `from`
          user and right by the `to` user
        - common_users_from_right_to_left_swiped: swiped right by the `from`
          user and left by the `to` user
        A count is 0 when either user has no swipe of the required kind.
    """

    def _targets_by_swiper(status):
        # Map each swiping user to the set of users they swiped with `status`.
        swipes = interaction.loc[interaction.swipe_status == status, ['from', 'to']]
        return {swiper: set(targets) for swiper, targets in swipes.groupby('from')['to']}

    def _shared(first_lookup, first_user, second_lookup, second_user):
        # Number of distinct targets present in both users' sets (0 if either is missing).
        return len(first_lookup.get(first_user, no_targets) & second_lookup.get(second_user, no_targets))

    # Sets are built once up front so every pair costs a single set intersection.
    right_targets = _targets_by_swiper(1)
    left_targets = _targets_by_swiper(-1)
    no_targets = frozenset()

    from_to_list = []
    common_swipes_right = []
    common_swipes_left = []
    common_from_left_to_right_swiped = []
    common_from_right_to_left_swiped = []

    pairs = zip(df['from-to'], df['from'], df['to'])
    for from_to, from_user, to_user in tqdm(pairs, total=len(df)):
        from_to_list.append(from_to)
        common_swipes_right.append(_shared(right_targets, from_user, right_targets, to_user))
        common_swipes_left.append(_shared(left_targets, from_user, left_targets, to_user))
        common_from_left_to_right_swiped.append(_shared(left_targets, from_user, right_targets, to_user))
        common_from_right_to_left_swiped.append(_shared(right_targets, from_user, left_targets, to_user))

    common_swipes = pd.DataFrame({'from-to': from_to_list,
                                  'common_users_swiped_right': common_swipes_right,
                                  'common_users_swiped_left': common_swipes_left,
                                  'common_users_from_left_to_right_swiped': common_from_left_to_right_swiped,
                                  'common_users_from_right_to_left_swiped': common_from_right_to_left_swiped})

    return common_swipes

In [None]:
# Keep only the columns needed for the overlap features and drop repeated
# (from, to, swipe_status) rows.
small_interaction_df = interaction_swipe[['from', 'to', 'swipe_status']].drop_duplicates().copy()

# Shared-swipe-target counts for the train and test pairs.
train_common_users_swiped = common_users_swiped(interaction=small_interaction_df.copy(), df = train_prod.copy())
test_common_users_swiped = common_users_swiped(interaction= small_interaction_df.copy(), df = test_prod.copy())

# Cache the results on disk; the next cell reloads them from these files.
train_common_users_swiped.to_csv("../data/train_common_users_swiped_v2.csv", index=False)
test_common_users_swiped.to_csv("../data/test_common_users_swiped_v2.csv", index=False)

In [None]:
# Reload the cached shared-swipe features.
train_common_users_swiped = pd.read_csv("../data/train_common_users_swiped_v2.csv")
test_common_users_swiped = pd.read_csv("../data/test_common_users_swiped_v2.csv")

# Shapes are printed before and after the left joins on 'from-to'.
print(train_prod.shape, test_prod.shape)
train_prod = train_prod.merge(train_common_users_swiped, on = 'from-to', how='left')
test_prod = test_prod.merge(test_common_users_swiped, on = 'from-to', how='left')
print(train_prod.shape, test_prod.shape)

In [None]:
def common_users_session_time(sessions, df):
    """
    Count the calendar days and hours on which both users of a pair had a session.

    Parameters
    ----------
    sessions : DataFrame
        Session log with the columns 'user_id', 'year', 'month', 'day', 'hour'.
    df : DataFrame
        Pairs to score; must contain 'from-to', 'from' and 'to'.

    Returns
    -------
    DataFrame
        One row per row of ``df`` (same order) with the columns 'from-to',
        'same_day_session_count' (distinct (year, month, day) values present
        for both users) and 'same_hour_session_count' (distinct
        (year, month, day, hour) values present for both users). Both counts
        are 0 when `from` and `to` are the same user or when either user has
        no session.

    Notes
    -----
    The date-part columns are assumed to contain no missing values; NaN parts
    are never counted as a match here.
    """
    slot_cols = ['year', 'month', 'day', 'hour']

    # Build each user's distinct day/hour slots once instead of filtering the
    # whole session table for every pair; only users that occur in `df` matter.
    wanted_users = list(set(df['from']) | set(df['to']))
    relevant = sessions.loc[sessions.user_id.isin(wanted_users), ['user_id'] + slot_cols]

    hours_by_user = {}
    days_by_user = {}
    for user, user_sessions_part in relevant.groupby('user_id'):
        hour_slots = set(map(tuple, user_sessions_part[slot_cols].to_numpy().tolist()))
        hours_by_user[user] = hour_slots
        days_by_user[user] = {slot[:3] for slot in hour_slots}

    no_slots = frozenset()
    from_to_list = []
    same_day_session_count = []
    same_hour_session_count = []

    pairs = zip(df['from-to'], df['from'], df['to'])
    for from_to, from_user, to_user in tqdm(pairs, total=len(df)):
        if from_user == to_user:
            # A user paired with themselves has no second user to overlap with.
            shared_days = 0
            shared_hours = 0
        else:
            shared_days = len(days_by_user.get(from_user, no_slots) & days_by_user.get(to_user, no_slots))
            shared_hours = len(hours_by_user.get(from_user, no_slots) & hours_by_user.get(to_user, no_slots))

        from_to_list.append(from_to)
        same_day_session_count.append(shared_days)
        same_hour_session_count.append(shared_hours)

    common_user_session = pd.DataFrame({'from-to': from_to_list,
                                        'same_day_session_count': same_day_session_count,
                                        'same_hour_session_count': same_hour_session_count})

    return common_user_session

In [None]:
print("User session shape", user_sessions.shape)
# One row per distinct (user, year, month, day, hour).
uniq_user_sessions = user_sessions[['user_id', 'year', 'month', 'day', 'hour']].drop_duplicates().copy()
print("Unique User session shape", uniq_user_sessions.shape)

# NOTE(review): only the label slices 0..100 (train) and 0..10 (test) are
# scored here, so the CSVs written below cover just those rows and every other
# pair ends up with NaN session features after the later left join. This looks
# like a leftover debugging subset - confirm before relying on these features.
train_prod_common_sessions = common_users_session_time(sessions=uniq_user_sessions.copy(), df=train_prod.loc[0:100, :].copy())
test_prod_common_sessions = common_users_session_time(sessions=uniq_user_sessions.copy(), df=test_prod.loc[0:10, :].copy())

# Cache the results on disk; the next cell reloads them from these files.
train_prod_common_sessions.to_csv("../data/train_prod_common_sessions.csv", index=False)
test_prod_common_sessions.to_csv("../data/test_prod_common_sessions.csv", index=False)

In [None]:
# Reload the cached shared-session features.
train_prod_common_sessions = pd.read_csv("../data/train_prod_common_sessions.csv")
test_prod_common_sessions = pd.read_csv("../data/test_prod_common_sessions.csv")

# Shapes are printed before and after the left joins on 'from-to'.
print(train_prod.shape, test_prod.shape)
train_prod = train_prod.merge(train_prod_common_sessions, on = 'from-to', how='left')
test_prod = test_prod.merge(test_prod_common_sessions, on = 'from-to', how='left')
print(train_prod.shape, test_prod.shape)

In [None]:
def get_common_purpose(user_df, purpose_df, purpose_columns):
    """
    Build per-purpose agreement flags for every (from, to) pair.

    The purpose rows of the two users are added column by column and compared
    with 2, 0 and 1 (this presumes 0/1 indicator columns - verify against the
    purpose data):
    - sum == 2 -> common interested purpose
    - sum == 0 -> common uninterested purpose
    - sum == 1 -> purpose set for exactly one of the two users

    Parameters
    ----------
    user_df : DataFrame
        Must contain the columns 'from-to', 'from' and 'to'.
    purpose_df : DataFrame
        Must contain 'user_id' and every column in ``purpose_columns``. If a
        user has several rows only the first one is used.
    purpose_columns : list-like of str
        Purpose columns to compare. The number of flag columns follows its
        length (previously fixed at 15).

    Returns
    -------
    tuple of three DataFrames
        ``(common_interested, common_uninterested, uncommon_interested)``.
        Each has one row per row of ``user_df`` with 'from-to', one 0.0/1.0
        flag column per purpose (suffix 1..n in ``purpose_columns`` order) and
        a trailing count column holding the row sum of the flags. Pairs where
        either user has no row in ``purpose_df`` get all-zero flags.
    """
    purpose_columns = list(purpose_columns)
    n_purposes = len(purpose_columns)

    # One lookup table indexed by user instead of re-scanning purpose_df for
    # every pair and growing arrays with np.append.
    purposes_by_user = (
        purpose_df
        .drop_duplicates(subset='user_id', keep='first')
        .set_index('user_id')[purpose_columns]
    )

    from_ids = user_df['from']
    to_ids = user_df['to']
    both_known = (
        from_ids.isin(purposes_by_user.index) & to_ids.isin(purposes_by_user.index)
    ).to_numpy()
    purpose_sum = (
        purposes_by_user.reindex(from_ids.to_numpy()).to_numpy()
        + purposes_by_user.reindex(to_ids.to_numpy()).to_numpy()
    )
    from_to = user_df['from-to'].to_numpy()

    def _flag_frame(target_sum, column_prefix, count_column):
        # Flags where the summed indicators equal `target_sum`, plus their row total.
        flags = ((purpose_sum == target_sum) & both_known[:, None]).astype(float)
        flag_columns = [column_prefix + str(i) for i in range(1, n_purposes + 1)]
        frame = pd.DataFrame(flags, columns=flag_columns)
        frame.insert(0, 'from-to', from_to)
        frame[count_column] = flags.sum(axis=1)
        return frame

    common_interested_purpose_df = _flag_frame(
        2, "common_interest_purpose_", 'common_interest_purpose_count')
    common_uninterested_purpose_df = _flag_frame(
        0, "common_uninterested_purpose_", 'common_uninterest_purpose_count')
    uncommon_interested_purpose_df = _flag_frame(
        1, "uncommon_interested_purpose_", 'uncommon_interested_purpose_count')

    return common_interested_purpose_df, common_uninterested_purpose_df, uncommon_interested_purpose_df


In [None]:
# Every user_purpose column whose name contains 'purpose'.
sel_columns = user_purpose.columns[user_purpose.columns.str.contains('purpose')]

# NOTE(review): both calls score only the label slice 0..10 of the pair frames,
# so the CSVs written below cover just those rows and every other pair ends up
# with NaN purpose flags after the later left join. This looks like a leftover
# debugging subset - confirm before relying on these features.
train_common_interested_purpose_df, train_common_uninterested_purpose_df, train_uncommon_interested_purpose_df = get_common_purpose(user_df=train_prod.loc[0:10,:].copy(), 
                                                                                                                                    purpose_df=user_purpose, 
                                                                                                                                    purpose_columns=sel_columns)

test_common_interested_purpose_df, test_common_uninterested_purpose_df, test_uncommon_interested_purpose_df = get_common_purpose(user_df=test_prod.loc[0:10,:].copy(), 
                                                                                                                                 purpose_df=user_purpose, 
                                                                                                                                 purpose_columns=sel_columns)

# Cache all six frames on disk; the next cell reloads them from these files.
train_common_interested_purpose_df.to_csv("../data/train_common_interested_purpose_df.csv", index=False)
train_common_uninterested_purpose_df.to_csv("../data/train_common_uninterested_purpose_df.csv", index=False)
train_uncommon_interested_purpose_df.to_csv("../data/train_uncommon_interested_purpose_df.csv", index=False)

test_common_interested_purpose_df.to_csv("../data/test_common_interested_purpose_df.csv", index=False)
test_common_uninterested_purpose_df.to_csv("../data/test_common_uninterested_purpose_df.csv", index=False)
test_uncommon_interested_purpose_df.to_csv("../data/test_uncommon_interested_purpose_df.csv", index=False)

In [None]:
# Reload the cached per-purpose agreement flags.
train_common_interested_purpose_df = pd.read_csv("../data/train_common_interested_purpose_df.csv")
train_common_uninterested_purpose_df = pd.read_csv("../data/train_common_uninterested_purpose_df.csv")
train_uncommon_interested_purpose_df = pd.read_csv("../data/train_uncommon_interested_purpose_df.csv")

test_common_interested_purpose_df = pd.read_csv("../data/test_common_interested_purpose_df.csv")
test_common_uninterested_purpose_df = pd.read_csv("../data/test_common_uninterested_purpose_df.csv")
test_uncommon_interested_purpose_df = pd.read_csv("../data/test_uncommon_interested_purpose_df.csv")

# Left-join the three flag frames one after another on 'from-to', printing the
# shape before the first join and after each join.
print(train_prod.shape)
for purpose_frame in (train_common_interested_purpose_df,
                      train_common_uninterested_purpose_df,
                      train_uncommon_interested_purpose_df):
    train_prod = train_prod.merge(purpose_frame, on='from-to', how='left')
    print(train_prod.shape)

print(test_prod.shape)
for purpose_frame in (test_common_interested_purpose_df,
                      test_common_uninterested_purpose_df,
                      test_uncommon_interested_purpose_df):
    test_prod = test_prod.merge(purpose_frame, on='from-to', how='left')
    print(test_prod.shape)

In [None]:
def common_users_strength_vote_diff(strength, df):
    """
    Compute the per-column difference (from minus to) of the users' strength values.

    Parameters
    ----------
    strength : DataFrame
        Must contain 'user_id' plus the strength columns, i.e. every column
        whose name contains 'strength'. If a user has several rows only the
        first one is used.
    df : DataFrame
        Pairs to score; must contain 'from-to', 'from' and 'to'.

    Returns
    -------
    DataFrame
        One row per row of ``df`` (same order) with 'from-to' followed by
        'common_strength_id1' .. 'common_strength_idN', one float column per
        strength column. A user without a row in ``strength`` contributes
        zeros, so the row is ``-to`` when only `from` is missing, ``from`` when
        only `to` is missing and all zeros when both are missing.
    """
    strength_col = list(strength.columns[strength.columns.str.contains('strength')])

    # One lookup table indexed by user; the width follows the strength columns
    # rather than a hard-coded 8.
    strength_by_user = (
        strength
        .drop_duplicates(subset='user_id', keep='first')
        .set_index('user_id')[strength_col]
    )

    def _values_or_zero(user_ids):
        # Strength rows for `user_ids`, with zeros for users absent from the table.
        known = user_ids.isin(strength_by_user.index).to_numpy()[:, None]
        values = strength_by_user.reindex(user_ids.to_numpy()).to_numpy(dtype=float)
        # Only unknown users are zeroed; NaNs stored for known users are kept.
        return np.where(known, values, 0.0)

    strength_diff = _values_or_zero(df['from']) - _values_or_zero(df['to'])

    cols = ['common_strength_id' + str(i + 1) for i in range(len(strength_col))]
    strength_diff_df = pd.DataFrame(strength_diff, columns=cols)
    strength_diff_df.insert(0, 'from-to', df['from-to'].to_numpy())

    return strength_diff_df


In [None]:
# Strength differences (from minus to) for the train and test pairs.
train_prod_strength_diff = common_users_strength_vote_diff(strength=user_strengths, df=train_prod)
test_prod_strength_diff = common_users_strength_vote_diff(strength=user_strengths, df=test_prod)

# Cache the results on disk; the next cell reloads them from these files.
train_prod_strength_diff.to_csv("../data/train_prod_strength_diff.csv", index=False)
test_prod_strength_diff.to_csv("../data/test_prod_strength_diff.csv", index=False)

In [None]:
# Reload the cached strength differences.
train_prod_strength_diff = pd.read_csv("../data/train_prod_strength_diff.csv")
test_prod_strength_diff = pd.read_csv("../data/test_prod_strength_diff.csv")

# Shapes are shown before and after the left joins on 'from-to'.
print(train_prod.shape, test_prod.shape)
train_prod = train_prod.merge(train_prod_strength_diff, on='from-to', how='left')
test_prod = test_prod.merge(test_prod_strength_diff, on='from-to', how='left')

train_prod.shape, test_prod.shape

In [None]:
print("Getting the total swipe days of the from and to user in the interaction swipe dataset")

def total_swipes_per_day_stats(df, side):
    """
    Summarise how many swipes per day each user is involved in, split by swipe direction.

    Swipes are first counted per (user, year, month, day, swipe_status); the
    daily counts are then reduced per (user, swipe_status) to their mean,
    median, max and min. Status -1 is reported as "left_swipe" and status 1 as
    "right_swipe".

    Parameters
    ----------
    df : DataFrame
        Swipe log with ``side``, 'year', 'month', 'day' and 'swipe_status'.
    side : str
        Column identifying the user to aggregate by ('from' or 'to').

    Returns
    -------
    DataFrame
        One row per user with the column ``side`` followed by
        left/right_swipe_mean_swipes_per_day,
        left/right_swipe_median_swipes_per_day,
        left/right_swipe_max_swipes_per_day and
        left/right_swipe_min_swipes_per_day (all float). A statistic is 0 for
        a user with no swipe of that direction, including the case where a
        direction is absent from ``df`` altogether.
    """
    stat_names = ['mean', 'median', 'max', 'min']
    status_prefixes = [(-1, 'left_swipe'), (1, 'right_swipe')]

    per_day = (
        df.groupby([side, 'year', 'month', 'day', 'swipe_status'])
        .size()
        .reset_index(name='total_swipes_per_day')
    )
    stats = per_day.groupby([side, 'swipe_status'])['total_swipes_per_day'].agg(stat_names)
    # Columns become (statistic, swipe_status) pairs.
    wide = stats.unstack('swipe_status')

    # Columns are looked up by (statistic, status) instead of being renamed by
    # position, so a missing swipe direction cannot shift or break the labels.
    features = {}
    for stat in stat_names:
        for status, prefix in status_prefixes:
            if (stat, status) in wide.columns:
                values = wide[(stat, status)].fillna(0).astype(float)
            else:
                values = pd.Series(0.0, index=wide.index)
            features[prefix + '_' + stat + '_swipes_per_day'] = values

    result = pd.DataFrame(features, index=wide.index, columns=list(features))
    return result.reset_index()

# Per-user daily swipe statistics, once keyed by `from` and once keyed by `to`.
from_total_swipe_days = total_swipes_per_day_stats(df=interaction_swipe.copy(), side='from')
to_total_swipe_days = total_swipes_per_day_stats(df=interaction_swipe.copy(), side='to')

In [None]:
# Cache the per-user swipe statistics on disk; a later cell reloads them from these files.
from_total_swipe_days.to_csv('../data/from_total_swipe_days.csv', index=False)
to_total_swipe_days.to_csv('../data/to_total_swipe_days.csv', index=False)

In [None]:
gc.collect()

# NOTE(review): this replaces the train_prod / test_prod frames built by the
# cells above with the pickled "v16" snapshots. Whether those snapshots already
# contain the features merged above cannot be seen from this notebook section -
# confirm before running top to bottom.
# pd.read_pickle can execute arbitrary code; only load files from a trusted source.
train_prod = pd.read_pickle("../data/train_prod_v16.pickle")
test_prod = pd.read_pickle("../data/test_prod_v16.pickle")


In [None]:
# Reload the cached per-user daily swipe statistics.
from_total_swipe_days = pd.read_csv('../data/from_total_swipe_days.csv')
to_total_swipe_days = pd.read_csv('../data/to_total_swipe_days.csv')

In [None]:
print(train_prod.shape, test_prod.shape)

# Attach the `from`-side statistics, then the `to`-side statistics, to every pair.
train_prod = (
    train_prod
    .merge(from_total_swipe_days, on='from', how='left')
    .merge(to_total_swipe_days, on='to', how='left')
)
test_prod = (
    test_prod
    .merge(from_total_swipe_days, on='from', how='left')
    .merge(to_total_swipe_days, on='to', how='left')
)

train_prod.shape, test_prod.shape

In [None]:
# Persist the enriched pair frames as the "v17" snapshots.
train_prod.to_pickle("../data/train_prod_v17.pickle")
test_prod.to_pickle("../data/test_prod_v17.pickle")