In [None]:
from google.colab import drive
drive.mount ('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, balanced_accuracy_score
from sklearn.linear_model import LogisticRegression

# load data

In [None]:
df_profiles = pd.read_csv('drive/MyDrive/recsys_comp/profiles.csv')
print(df_profiles.shape)
df_profiles.head(6)

(4904, 11)


Unnamed: 0,id,is_closed,city,sex,followers_count,schools,schools_years_grad,universities,universities_years_grad,faculties,careers
0,1119,False,2.0,2,1256.0,,,,,,
1,1127,False,2.0,2,2738.0,"['12', '3447', '59']","[2005, None, None]","[53, 53, 53]","[2011, 2014, 2012]","[201, 201, 169574]","[None, None, None, None, None, None, None]"
2,2619,True,2.0,2,,"['54', '1319']","[2005, None]","[1, 29, 1]","[2010, 2011, 2013]","[7, 1815, 7]",[]
3,2767,False,2.0,2,446.0,"['12', '6972', '969864']","[2005, 2001, 1996]",[53],[2011],[201],[]
4,4023,False,2.0,2,462.0,"['54054', '11301']","[2002, 2002]","[53, 53]","[2008, 2011]","[202, 208]","[94, 94, None]"
5,18239,False,2.0,2,1887.0,['1739'],[None],[53],[None],[202],"[94, 76139618]"


In [None]:
df_posts = pd.read_csv('drive/MyDrive/recsys_comp/posts.csv')
print(df_posts.shape)
df_posts.head(6)

(43979, 7)


Unnamed: 0,comments_count,attachments_count,views,likes,author_id,post_type,post_dttm
0,1,0,,0.0,1119,post,1685912400
1,2,0,1402.0,24.0,1119,post,1664053200
2,3,0,852.0,8.0,1119,post,1647205200
3,7,1,1390.0,24.0,1119,post,1630702800
4,7,2,1819.0,14.0,1119,post,1612299600
5,3,0,741.0,9.0,1119,post,1611090000


In [None]:
df_friends = pd.read_csv('drive/MyDrive/recsys_comp/friends.csv')
print(df_friends.shape)
df_friends.head(3)

(953179, 2)


Unnamed: 0,user1,user2
0,1119,9216
1,1119,3580931
2,1119,6055941


In [None]:
df_link = pd.read_csv('drive/MyDrive/recsys_comp/links_train.csv')
df_link

Unnamed: 0,user1,user2,is_friends
0,82530889,157814259,0
1,104274145,669799766,0
2,227438304,268462283,0
3,782512,66408174,0
4,164821380,188883374,0
...,...,...,...
6356749,151713163,349490510,0
6356750,111420790,214609156,0
6356751,29149621,293027231,0
6356752,18239,251384889,0


In [None]:
df_test = pd.read_csv('drive/MyDrive/recsys_comp/links_test.csv')
df_test

Unnamed: 0,ID,user1,user2
0,0,426512423,767230291
1,1,127744839,589283004
2,2,76324543,189101836
3,3,79828756,272293999
4,4,88063046,280460807
...,...,...,...
706402,706402,266088,172683390
706403,706403,158366680,181474112
706404,706404,50956059,82110465
706405,706405,137877629,348218549


In [None]:
df_1 = df_link.drop(columns=('is_friends'))

In [None]:
ds_full = pd.concat([df_1, df_test], ignore_index=True)
len(ds_full)

7063161

In [None]:
df_train = ds_full.merge(df_profiles, how='left', left_on='user1', right_on='id')
df_train = df_train.merge(df_profiles,  how='left', left_on='user2', right_on='id', suffixes=('_1', '_2'))
df_train.head()

Unnamed: 0,user1,user2,ID,id_1,is_closed_1,city_1,sex_1,followers_count_1,schools_1,schools_years_grad_1,...,is_closed_2,city_2,sex_2,followers_count_2,schools_2,schools_years_grad_2,universities_2,universities_years_grad_2,faculties_2,careers_2
0,82530889,157814259,,82530889,False,,2,313.0,,,...,False,2.0,2,371.0,"['1428', '7307']","[None, 2020]",[17],[None],[78],[]
1,104274145,669799766,,104274145,False,,2,706.0,[],[],...,False,,1,201.0,[],[],[],[],[],[]
2,227438304,268462283,,227438304,False,2.0,1,156.0,[],[],...,False,,2,207.0,[],[],[53],[None],[None],[]
3,782512,66408174,,782512,False,2.0,2,213.0,"['52067', '248414']","[2000, 2000]",...,False,2.0,2,252.0,,,,,,
4,164821380,188883374,,164821380,False,,2,81.0,['251150'],[None],...,False,2.0,2,329.0,,,,,,


# make features

In [None]:
user2friends = df_friends.groupby('user1').user2.apply(set).to_dict()

In [None]:
import ast

def calc_same_schools(row):
    schools_1 = ast.literal_eval(row['schools_1']) if pd.notna(row['schools_1']) else []
    schools_2 = ast.literal_eval(row['schools_2']) if pd.notna(row['schools_2']) else []

    schools_1 = set(schools_1) if isinstance(schools_1, list) else set()
    schools_2 = set(schools_2) if isinstance(schools_2, list) else set()

    return len(schools_1 & schools_2)

def calc_same_universities(row):
    universities_1 = ast.literal_eval(row['universities_1']) if pd.notna(row['universities_1']) else []
    universities_2 = ast.literal_eval(row['universities_2']) if pd.notna(row['universities_2']) else []

    universities_1 = set(universities_1) if isinstance(universities_1, list) else set()
    universities_2 = set(universities_2) if isinstance(universities_2, list) else set()

    return len(universities_1 & universities_2)

def calc_same_faculties(row):
    faculties_1 = ast.literal_eval(row['faculties_1']) if pd.notna(row['faculties_1']) else []
    faculties_2 = ast.literal_eval(row['faculties_2']) if pd.notna(row['faculties_2']) else []

    faculties_1 = set(faculties_1) if isinstance(faculties_1, list) else set()
    faculties_2 = set(faculties_2) if isinstance(faculties_2, list) else set()

    return len(faculties_1 & faculties_2)

def calc_same_careers(row):
    careers_1 = ast.literal_eval(row['careers_1']) if pd.notna(row['careers_1']) else []
    careers_2 = ast.literal_eval(row['careers_2']) if pd.notna(row['careers_2']) else []

    careers_1 = set(careers_1) if isinstance(careers_1, list) else set()
    careers_2 = set(careers_2) if isinstance(careers_2, list) else set()

    return len(careers_1 & careers_2)

df_train['count_same_schools'] = df_train.apply(calc_same_schools, axis=1)
df_train['count_same_universities'] = df_train.apply(calc_same_universities, axis=1)
df_train['count_same_faculties'] = df_train.apply(calc_same_faculties, axis=1)
df_train['count_same_careers'] = df_train.apply(calc_same_careers, axis=1)


In [None]:
df_train

Unnamed: 0,user1,user2,ID,id_1,is_closed_1,city_1,sex_1,followers_count_1,schools_1,schools_years_grad_1,...,schools_2,schools_years_grad_2,universities_2,universities_years_grad_2,faculties_2,careers_2,count_same_schools,count_same_universities,count_same_faculties,count_same_careers
0,82530889,157814259,,82530889,False,,2,313.0,,,...,"['1428', '7307']","[None, 2020]",[17],[None],[78],[],0,0,0,0
1,104274145,669799766,,104274145,False,,2,706.0,[],[],...,[],[],[],[],[],[],0,0,0,0
2,227438304,268462283,,227438304,False,2.0,1,156.0,[],[],...,[],[],[53],[None],[None],[],0,1,1,0
3,782512,66408174,,782512,False,2.0,2,213.0,"['52067', '248414']","[2000, 2000]",...,,,,,,,0,0,0,0
4,164821380,188883374,,164821380,False,,2,81.0,['251150'],[None],...,,,,,,,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7063156,266088,172683390,706402.0,266088,False,1.0,2,1627.0,,,...,['646'],[None],[53],[None],[2268355],[37784625],0,0,0,0
7063157,158366680,181474112,706403.0,158366680,False,2.0,1,220.0,,,...,['113495'],[None],"[729, 53]","[2021, None]","[3181, 2185900]",[],0,0,0,0
7063158,50956059,82110465,706404.0,50956059,False,2.0,2,492.0,,,...,[],[],[],[],[],[161665691],0,0,0,0
7063159,137877629,348218549,706405.0,137877629,False,2.0,2,205.0,,,...,['85962'],[2019],[],[],[],[],0,0,0,0


In [None]:
import networkx as nx

In [None]:
g_fr = nx.from_pandas_edgelist(df_friends, 'user1', 'user2')


In [None]:
common_friends = []
for user1, user2 in zip(df_train.user1, df_train.user2):
    try:
        common_friends.append(len(set(g_fr.neighbors(user1)) & set(g_fr.neighbors(user2))))
    except nx.NetworkXError:
        common_friends.append(0)

df_train['common_friends'] = common_friends


In [None]:
df_train

Unnamed: 0,user1,user2,ID,id_1,is_closed_1,city_1,sex_1,followers_count_1,schools_1,schools_years_grad_1,...,schools_years_grad_2,universities_2,universities_years_grad_2,faculties_2,careers_2,count_same_schools,count_same_universities,count_same_faculties,count_same_careers,common_friends
0,82530889,157814259,,82530889,False,,2,313.0,,,...,"[None, 2020]",[17],[None],[78],[],0,0,0,0,0
1,104274145,669799766,,104274145,False,,2,706.0,[],[],...,[],[],[],[],[],0,0,0,0,0
2,227438304,268462283,,227438304,False,2.0,1,156.0,[],[],...,[],[53],[None],[None],[],0,1,1,0,0
3,782512,66408174,,782512,False,2.0,2,213.0,"['52067', '248414']","[2000, 2000]",...,,,,,,0,0,0,0,0
4,164821380,188883374,,164821380,False,,2,81.0,['251150'],[None],...,,,,,,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7063156,266088,172683390,706402.0,266088,False,1.0,2,1627.0,,,...,[None],[53],[None],[2268355],[37784625],0,0,0,0,0
7063157,158366680,181474112,706403.0,158366680,False,2.0,1,220.0,,,...,[None],"[729, 53]","[2021, None]","[3181, 2185900]",[],0,0,0,0,0
7063158,50956059,82110465,706404.0,50956059,False,2.0,2,492.0,,,...,[],[],[],[],[161665691],0,0,0,0,0
7063159,137877629,348218549,706405.0,137877629,False,2.0,2,205.0,,,...,[2019],[],[],[],[],0,0,0,0,0


In [None]:
df_train['friends_user1'] = np.array(g_fr.degree(df_train.user1))[:, 1]
df_train['friends_user2'] = np.array(g_fr.degree(df_train.user2))[:, 1]


In [None]:
df_train['same_city'] = df_train['city_1'] == df_train['city_2']
df_train['same_city'] = df_train['same_city'].astype(int)

df_train['same_sex'] = df_train['sex_1'] == df_train['sex_2']
df_train['same_sex'] = df_train['same_sex'].astype(int)

In [None]:
def jaccard_coeff(user1, user2, g_fr):
    user1_nei = set(g_fr.neighbors(user1))
    user2_nei = set(g_fr.neighbors(user2))
    return len(user1_nei & user2_nei) / len(user1_nei | user2_nei)


df_train['jaccard_coeff'] = df_train.apply(lambda row: jaccard_coeff(row['user1'], row['user2'], g_fr), axis=1)


In [None]:
df_train

In [None]:
df_train.to_csv('drive/MyDrive/recsys_comp/trained_df.csv')

In [None]:
df_train = pd.read_csv('drive/MyDrive/recsys_comp/trained_df.csv')

In [None]:
df_train

Unnamed: 0.1,Unnamed: 0,user1,user2,ID,id_1,is_closed_1,city_1,sex_1,followers_count_1,schools_1,...,count_same_schools,count_same_universities,count_same_faculties,count_same_careers,common_friends,friends_user1,friends_user2,same_city,same_sex,jaccard_coeff
0,0,82530889,157814259,,82530889,False,,2,313.0,,...,0,0,0,0,0,135,336,0,1,0.0
1,1,104274145,669799766,,104274145,False,,2,706.0,[],...,0,0,0,0,0,387,194,0,0,0.0
2,2,227438304,268462283,,227438304,False,2.0,1,156.0,[],...,0,1,1,0,0,75,97,0,0,0.0
3,3,782512,66408174,,782512,False,2.0,2,213.0,"['52067', '248414']",...,0,0,0,0,0,175,171,1,1,0.0
4,4,164821380,188883374,,164821380,False,,2,81.0,['251150'],...,0,0,0,0,0,81,227,0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7063156,7063156,266088,172683390,706402.0,266088,False,1.0,2,1627.0,,...,0,0,0,0,0,1183,220,0,0,0.0
7063157,7063157,158366680,181474112,706403.0,158366680,False,2.0,1,220.0,,...,0,0,0,0,0,104,102,1,0,0.0
7063158,7063158,50956059,82110465,706404.0,50956059,False,2.0,2,492.0,,...,0,0,0,0,0,400,16,1,1,0.0
7063159,7063159,137877629,348218549,706405.0,137877629,False,2.0,2,205.0,,...,0,0,0,0,0,173,26,0,1,0.0


# evaluate model

In [None]:
links_train = pd.read_csv('drive/MyDrive/recsys_comp/links_train.csv')
links_test = pd.read_csv('drive/MyDrive/recsys_comp/links_test.csv')

len(links_train), len(links_test)

(6356754, 706407)

In [None]:
df_train.columns

Index(['Unnamed: 0', 'user1', 'user2', 'ID', 'id_1', 'is_closed_1', 'city_1',
       'sex_1', 'followers_count_1', 'schools_1', 'schools_years_grad_1',
       'universities_1', 'universities_years_grad_1', 'faculties_1',
       'careers_1', 'id_2', 'is_closed_2', 'city_2', 'sex_2',
       'followers_count_2', 'schools_2', 'schools_years_grad_2',
       'universities_2', 'universities_years_grad_2', 'faculties_2',
       'careers_2', 'count_same_schools', 'count_same_universities',
       'count_same_faculties', 'count_same_careers', 'common_friends',
       'friends_user1', 'friends_user2', 'same_city', 'same_sex',
       'jaccard_coeff'],
      dtype='object')

In [None]:
new_train = df_train[['user1', 'user2', 'is_closed_1', 'followers_count_1', 'is_closed_2', 'followers_count_2', 'count_same_schools',
                           'count_same_universities','count_same_faculties', 'count_same_careers', 'common_friends', 'same_city', 'same_sex',
                             'friends_user1', 'friends_user2', 'jaccard_coeff']]

In [None]:
new_train['same_city'] = new_train['same_city'].astype(bool)
new_train['same_sex'] = new_train['same_sex'].astype(bool)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_train['same_city'] = new_train['same_city'].astype(bool)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_train['same_sex'] = new_train['same_sex'].astype(bool)


In [None]:
new_train.dtypes

user1                        int64
user2                        int64
is_closed_1                   bool
followers_count_1          float64
is_closed_2                   bool
followers_count_2          float64
count_same_schools           int64
count_same_universities      int64
count_same_faculties         int64
count_same_careers           int64
common_friends               int64
same_city                     bool
same_sex                      bool
friends_user1                int64
friends_user2                int64
jaccard_coeff              float64
dtype: object

In [None]:
new_train = new_train.fillna(int(new_train.followers_count_1.mean()))

In [None]:
train = links_train.merge(new_train, on=['user1', 'user2'], how='inner')
print(len(train))
train.head(3)

6356754


Unnamed: 0,user1,user2,is_friends,is_closed_1,followers_count_1,is_closed_2,followers_count_2,count_same_schools,count_same_universities,count_same_faculties,count_same_careers,common_friends,same_city,same_sex,friends_user1,friends_user2,jaccard_coeff
0,82530889,157814259,0,False,313.0,False,371.0,0,0,0,0,0,False,True,135,336,0.0
1,104274145,669799766,0,False,706.0,False,201.0,0,0,0,0,0,False,False,387,194,0.0
2,227438304,268462283,0,False,156.0,False,207.0,0,1,1,0,0,False,False,75,97,0.0


In [None]:
train, valid = train_test_split(train, test_size=0.2, stratify=train.is_friends)

In [None]:
test = links_test.merge(new_train, on=['user1', 'user2'], how='inner')
print(len(test))
test.head(3)
test = test.drop(columns=('ID'))

706407


In [None]:
X_train = train.drop(columns=('is_friends'))
y_train = train['is_friends']
X_valid = valid.drop(columns=('is_friends'))
y_valid = valid['is_friends']

In [None]:
from sklearn.metrics import balanced_accuracy_score

In [None]:
from lightgbm import LGBMClassifier

lgbm_model = LGBMClassifier(class_weight='balanced')
lgbm_model.fit(X_train, y_train, eval_set=(X_valid, y_valid))
lgbm_valid_preds = lgbm_model.predict(X_valid)

print("Train balanced accuracy: ", balanced_accuracy_score(y_valid, lgbm_valid_preds))

[LightGBM] [Info] Number of positive: 15917, number of negative: 5069486
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.270860 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1869
[LightGBM] [Info] Number of data points in the train set: 5085403, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Train balanced accuracy:  0.9484973341947143


In [None]:
lgbm_test_preds = lgbm_model.predict(test)

In [None]:
# submission file

def submission_file(test_pred, pred_name = 'drive/MyDrive/recsys_comp/lgbm_balanced'):
    return pd.Series(test_pred, name="is_friends").to_csv(
    pred_name, index_label="ID", header=True)

In [None]:
submission_file(lgbm_test_preds, pred_name = 'drive/MyDrive/recsys_comp/lgbm_balanced.csv')