In [1]:
import pandas as pd
from sklearn.feature_extraction import FeatureHasher
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
import swifter
import random
from datetime import datetime
import datetime

In [2]:
import scipy

In [3]:
from scipy.sparse import hstack

In [4]:
from fastFM import sgd

In [5]:
def ep_to_day(ep):
    """Return the weekday of a Unix timestamp, evaluated in UTC.

    Args:
        ep: Seconds since the Unix epoch (int or float).

    Returns:
        int: Day of the week, Monday = 0 through Sunday = 6.
    """
    # An explicit UTC zone keeps the result independent of the time zone of
    # the machine running the notebook; the naive fromtimestamp() call would
    # use local time and shift day boundaries from host to host.
    return datetime.datetime.fromtimestamp(ep, tz=datetime.timezone.utc).weekday()

In [None]:
# Labels assigned, in order, to the eleven fields read from the raw file.
all_features = [
    'tweet_type',
    'language',
    'tweet_timestamp',
    'enaged_with_user_id',
    'engaged_with_user_follower_count',
    'engaged_with_user_following_count',
    'enaging_user_id',
    'enaging_user_follower_count',
    'enaging_user_following_count',
    'engagee_follows_engager',
    'reply_timestamp',
]

# Load only the selected field positions of the 0x01-separated training file.
training = pd.read_csv(
    's3://recsys-challenge-2020/training.tsv',
    sep="\x01",
    encoding="utf-8",
    usecols=[6, 7, 8, 9, 10, 11, 14, 15, 16, 19, 20],
    names=all_features,
)

In [None]:
# Preview the first rows of the freshly loaded frame.
training.head()

In [None]:
# Weekday of each tweet (Monday = 0 ... Sunday = 6), computed in one vectorized
# pass instead of one Python call per row. The epoch seconds are read as UTC,
# so the feature does not depend on the time zone of the host.
training['day'] = pd.to_datetime(training['tweet_timestamp'], unit='s', utc=True).dt.weekday

In [None]:
# Binary reply label: missing timestamps become 0, every non-zero timestamp
# is collapsed to 1.0.
reply_filled = training['reply_timestamp'].fillna(0)
training['reply_bool'] = reply_filled.mask(reply_filled != 0.0, 1.0)

In [None]:
# Keep everything except the two raw timestamp columns. The columns are chosen
# by name (in the same order the positional selection produced) so the result
# does not depend on the order in which `day` and `reply_bool` were added, and
# copied so the later column assignments act on an independent frame rather
# than on a slice of `training`.
subset_columns = [
    'tweet_type', 'language', 'enaged_with_user_id',
    'engaged_with_user_follower_count', 'engaged_with_user_following_count',
    'enaging_user_id', 'enaging_user_follower_count',
    'enaging_user_following_count', 'engagee_follows_engager',
    'day', 'reply_bool',
]
training_subset = training[subset_columns].copy()

In [None]:
# Show the column labels kept in the subset.
training_subset.columns

In [None]:
# Turn the follow flag and the weekday into strings, value by value.
for label_column in ('engagee_follows_engager', 'day'):
    training_subset[label_column] = training_subset[label_column].map(str)

In [None]:
# The four follower/following count columns (positions 3, 4, 6 and 7 of the subset).
training_subset_numerical = training_subset[[
    'engaged_with_user_follower_count',
    'engaged_with_user_following_count',
    'enaging_user_follower_count',
    'enaging_user_following_count',
]]

In [None]:
# Rescale the count columns to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows before the train/validation
# split further down, so validation statistics leak into the features -
# consider fitting on the training rows only.
x = training_subset_numerical.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Carry over the source index so the column-wise concat that follows lines the
# scaled values up row for row even if the frame's index is not 0..n-1.
df = pd.DataFrame(x_scaled, index=training_subset_numerical.index)

In [None]:
# Append the scaled columns (labelled 0-3) to the right of the subset.
training_data_all = pd.concat((training_subset, df), axis='columns')

In [None]:
# List every column label together with its position, as a lookup for the
# positional selection in the next cell.
[(c, training_data_all.columns.get_loc(c)) for c in training_data_all.columns]

In [None]:
# Keep the categorical columns, the label and the four scaled columns
# (positions 0, 1, 2, 5, 8, 9, 10 and 11-14); the unscaled counts are dropped.
kept_columns = [
    'tweet_type', 'language', 'enaged_with_user_id', 'enaging_user_id',
    'engagee_follows_engager', 'day', 'reply_bool',
    0, 1, 2, 3,
]
training_data = training_data_all.loc[:, kept_columns]

In [None]:
# Give the scaled columns the names of the count columns they were derived from.
scaled_column_names = {
    0: 'engaged_with_user_follower_count',
    1: 'engaged_with_user_following_count',
    2: 'enaging_user_follower_count',
    3: 'enaging_user_following_count',
}
training_data = training_data.rename(columns=scaled_column_names)

In [None]:
# Preview the frame after the scaled columns were renamed.
training_data.head()

In [None]:
# Index the rows by weekday while also keeping `day` as a regular column.
training_data = training_data.set_index(keys=['day'], drop=False)

In [None]:
# Distinct weekday values, in order of first appearance.
names = pd.unique(training_data['day']).tolist()

In [None]:
# Show the distinct weekday values found.
names

In [None]:
# Per-day split: within every weekday the leading `n_head` percent of rows go
# to the training set and the remaining rows to the validation set.
n_head = 90
n_tail = 100 - n_head
train_parts = []
val_parts = []
for day in names:
    print(day)
    train_day = training_data.loc[training_data.day==day]
    # One exact integer cut point: the two pieces are complementary, so no row
    # is lost to independent rounding of the 90% and 10% sizes.
    split_at = len(train_day) * n_head // 100
    train_set_scratch = train_day.iloc[:split_at]
    val_set_scratch = train_day.iloc[split_at:]
    train_parts.append(train_set_scratch)
    val_parts.append(val_set_scratch)
# Concatenate once instead of regrowing the frames on every iteration; fall
# back to an empty frame when there were no days to split.
train_scratch = pd.concat(train_parts) if train_parts else pd.DataFrame()
val_scratch = pd.concat(val_parts) if val_parts else pd.DataFrame()

In [None]:
# Replace the weekday index with a plain 0..n-1 index.
train_scratch = train_scratch.reset_index(drop=True)

In [None]:
# Replace the weekday index with a plain 0..n-1 index.
val_scratch = val_scratch.reset_index(drop=True)

In [None]:
# Preview the training portion of the split.
train_scratch.head()

In [None]:
# Preview the validation portion of the split.
val_scratch.head()

In [None]:
# Separate the label column from the training features.
y_train_df = train_scratch.loc[:, ['reply_bool']]
X_train_df = train_scratch.drop(columns=['reply_bool'])

In [None]:
# Separate the label column from the validation features.
y_val_df = val_scratch.loc[:, ['reply_bool']]
X_val_df = val_scratch.drop(columns=['reply_bool'])

In [None]:
# Hasher that maps each record dict to a 1000-column sparse row.
# NOTE(review): with only 1000 buckets the user-id columns presumably collide
# heavily - confirm this width is intended.
h = FeatureHasher(
    n_features=1000,
    input_type='dict',  # spelled out; this is the library default
)

In [None]:
# Preview the training features.
X_train_df.head()

In [None]:
# Preview the six leading columns - the same positions that are passed to the
# feature hasher in the next cell.
X_train_df.iloc[:, [0,1,2,3,4,5]].head()

In [None]:
# Hash the six leading columns of the training features, one dict per row.
X_train_hasher = h.fit_transform(
    X_train_df.iloc[:, :6].to_dict(orient='records')
)

In [None]:
# Hash the same six columns of the validation features with the same hasher.
X_val_hasher = h.transform(
    X_val_df.iloc[:, :6].to_dict(orient='records')
)

In [None]:
# Columns 6-9 of the training features (the scaled counts) as a sparse CSR block.
X_train_numerical = scipy.sparse.csr_matrix(
    X_train_df.iloc[:, 6:10].to_numpy()
)

In [None]:
# Columns 6-9 of the validation features (the scaled counts) as a sparse CSR block.
X_val_numerical = scipy.sparse.csr_matrix(
    X_val_df.iloc[:, 6:10].to_numpy()
)

In [None]:
# Show the summary repr of the hashed training matrix.
X_train_hasher

In [None]:
# Show the summary repr of the numeric training matrix.
X_train_numerical

In [None]:
# Training design matrix: hashed columns first, scaled counts appended on the right.
X_train = hstack((X_train_hasher, X_train_numerical))

In [None]:
# Validation design matrix, assembled in the same column order as the training one.
X_val = hstack((X_val_hasher, X_val_numerical))

In [None]:
# Show the summary repr of the combined training matrix.
X_train

In [None]:
# Show the summary repr of the combined validation matrix.
X_val

In [None]:
# Training labels as a flat NumPy vector.
y_train_values = y_train_df.loc[:, 'reply_bool'].to_numpy().squeeze()

In [None]:
# Validation labels as a flat NumPy vector.
y_val_values = y_val_df.loc[:, 'reply_bool'].to_numpy().squeeze()

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Inspect the training label vector.
y_train_values

In [None]:
# Baseline model: logistic regression with a fixed seed, fitted on the
# training matrix and labels.
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train_values)

In [None]:
# Class probabilities of the logistic-regression model on the validation
# matrix; the cells below read column 1 as the positive-class probability.
y_predicted = clf.predict_proba(X_val)

In [None]:
# Inspect the second probability column.
y_predicted[:,1]

In [None]:
from sklearn.metrics import precision_recall_curve, auc, log_loss

def compute_prauc(pred, gt):
    """Return the area under the precision-recall curve.

    Args:
        pred: Predicted scores, passed to `precision_recall_curve` as the
            score argument.
        gt: Ground-truth labels, passed as the label argument.

    Returns:
        The value of `auc` over the (recall, precision) points.
    """
    # The thresholds returned alongside the curve are not needed.
    precision, recall, _ = precision_recall_curve(gt, pred)
    return auc(recall, precision)

def calculate_ctr(gt):
    """Return the fraction of entries in `gt` that compare equal to 1.

    Args:
        gt: Sized iterable of labels.

    Returns:
        float: Number of entries equal to 1 divided by `len(gt)`.

    Raises:
        ZeroDivisionError: If `gt` is empty.
    """
    positives = sum(1 for label in gt if label == 1)
    return positives / float(len(gt))

def compute_rce(pred, gt):
    """Return the relative cross-entropy of `pred`, in percent.

    The log loss of `pred` is compared with the log loss of a constant
    prediction equal to the positive rate of `gt`; the result is
    `(1 - model_loss / baseline_loss) * 100`.

    Args:
        pred: Predicted probabilities.
        gt: Ground-truth labels.

    Returns:
        float: Percentage improvement over the constant baseline (negative
        when the predictions have a higher loss than the baseline).
    """
    baseline_rate = calculate_ctr(gt)
    baseline_loss = log_loss(gt, [baseline_rate] * len(gt))
    model_loss = log_loss(gt, pred)
    return (1.0 - model_loss / baseline_loss) * 100.0

In [None]:
# Relative cross-entropy of the logistic-regression probabilities (column 1)
# against the validation labels.
compute_rce(y_predicted[:,1], y_val_values)

In [None]:
# Inspect the full probability array.
y_predicted

In [None]:
# Precision-recall AUC of the logistic-regression probabilities (column 1)
# against the validation labels.
compute_prauc(y_predicted[:,1], y_val_values)

In [None]:
# Factorization-machine classifier trained with fastFM's SGD solver.
# NOTE(review): for this solver `n_iter` appears to count single-sample
# updates rather than passes over the data - confirm 1000 is enough.
fm = sgd.FMClassification(
    n_iter=1000,
    init_stdev=0.1,
    rank=8,
    l2_reg_w=0.03,
    l2_reg_V=0.05,
    step_size=0.01,
)

In [None]:
# Recode the negative class from 0 to -1 for the factorization-machine fit
# (presumably because fastFM expects labels in {-1, 1} - confirm).
# The vector is rebuilt from the label frame instead of being edited in place:
# an in-place write through `.values` also alters `y_train_df`, and fails
# outright when pandas hands out a read-only array (Copy-on-Write).
y_train_values = y_train_df['reply_bool'].replace(0, -1).to_numpy()

In [None]:
# Fit the factorization machine on the training matrix and the recoded labels.
fm.fit(X_train, y_train_values)

In [None]:
# Factorization-machine probabilities for the validation matrix. This rebinds
# `y_predicted`, replacing the logistic-regression output; the metric cells
# below pass it whole rather than selecting a column.
y_predicted = fm.predict_proba(X_val)

In [None]:
# Precision-recall AUC of the factorization-machine probabilities against the
# validation labels.
compute_prauc(y_predicted, y_val_values)

In [None]:
# Relative cross-entropy of the factorization-machine probabilities against
# the validation labels.
compute_rce(y_predicted, y_val_values)