# Challenge

## Tom Marzea 318443595 Michael Michaelshvili 318949443

In [17]:
%run utils.ipynb

In [18]:
import pandas as pd
import numpy as np
import os
import sys
# from utils import *
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
# import dill

In [19]:
import sklearn
from sklearn.svm import OneClassSVM
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, top_k_accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, FunctionTransformer, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, make_pipeline

In [20]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Dropout, Activation, Input, Embedding, TextVectorization, Reshape, Add, Concatenate, Flatten, Conv1D, Conv1DTranspose
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

In [45]:
_, seg_df = load_user_data()
print(seg_df.shape)
y = pd.read_csv('challengeToFill.csv').drop(columns=['Unnamed: 0']).T.reset_index(drop=True).T
y_train = y.iloc[:LABELED_USERS, :TRAIN_SEG]
y_test_df = y.iloc[:LABELED_USERS, TRAIN_SEG:]
train_df = seg_df.iloc[:LABELED_USERS, :TRAIN_SEG]
test_df = seg_df.iloc[:LABELED_USERS, TRAIN_SEG:]
submission_df = seg_df.iloc[LABELED_USERS:, TRAIN_SEG:]
sentences = train_df.to_numpy().flatten().tolist()
# embedding_model = create_embeddings(sentences, vector_size=EMBEDDING_DIM, window=5)
# print(embedding_model.wv)
print(train_df.shape)
print(test_df.shape)

(40, 150)
(10, 50)
(10, 100)


In [22]:
all_commands = np.unique(seg_df.to_numpy().flatten().tolist()).tolist()

In [23]:
train = train_df.to_numpy().flatten()
test = test_df.to_numpy().flatten()
train.shape, test.shape

((500,), (1000,))

In [24]:
w2x = {k: idx for idx, k in enumerate(all_commands)}
x2w = {idx: k for idx, k in enumerate(all_commands)}

In [25]:
def seg_to_hist(segs, vocab=all_commands):
    count_vec = CountVectorizer(lowercase=False, vocabulary=vocab)
    features = count_vec.transform([' '.join(seg) for seg in segs])
    return features.toarray()

In [85]:
def tfidf_per_user(uid, top_n=100):
    documents = []
    for i in range(NUM_USERS):
        documents.append(' '.join([' '.join(x) for x in seg_df.iloc[i,:TRAIN_SEG]]))
    tfidf = TfidfVectorizer(ngram_range=(3,5))
    data = tfidf.fit_transform(documents)
    tfidf_features = pd.DataFrame.sparse.from_spmatrix(data, columns=tfidf.get_feature_names_out()).sparse.to_dense().T
    tfidf_features.columns = [f'user{i}' for i in range(NUM_USERS)]
    top_n_user = tfidf_features.nlargest(top_n, f'user{uid}').index.tolist()
    return top_n_user

In [86]:
def get_all_features(top_n=100):
    all_features = []
    for uid in range(NUM_USERS):
        gram1_hist = seg_to_hist(seg_df.iloc[uid, :])
        top_n_user = tfidf_per_user(uid, top_n=top_n)
        ngram_hist = seg_to_hist(seg_df.iloc[uid, :], vocab=top_n_user)
        most_common = np.expand_dims(np.array([w2x[stats.mode(x).mode[0]] for x in seg_df.iloc[uid, :]]), axis=-1)
        most_common_count = np.expand_dims(np.array([stats.mode(x).count[0] for x in seg_df.iloc[uid, :]]), axis=-1)
        num_unique = np.expand_dims(np.array([len(set(x)) for x in seg_df.iloc[uid, :]]), axis=-1)
        all_features.append(np.concatenate([gram1_hist, ngram_hist, most_common, most_common_count, num_unique], axis=1))
    return all_features

In [87]:
features_per_user_seg = get_all_features(100)
features_per_user_seg[1][0].shape

(868,)

In [88]:
def build_train_data(uid, features_per_user_seg, other_users):
    x_train = features_per_user_seg[uid][:TRAIN_SEG]
    for i in range(NUM_USERS):
        if i != uid:
            x_train = np.concatenate([x_train, features_per_user_seg[i][:other_users]])
    y_train = np.concatenate([np.zeros(TRAIN_SEG), np.ones(other_users*(NUM_USERS-1))])
    return x_train, y_train

In [110]:
def build_train_model(uid, features_per_user_seg,other_users=3, top_n=15):
    x_train, y_train = build_train_data(uid, features_per_user_seg,other_users)
    clf = MLPClassifier(random_state=42, learning_rate='adaptive', hidden_layer_sizes=(80,), max_iter=2000)
    clf.fit(x_train, y_train)
    y_test = y_test_df.iloc[uid].to_numpy()
    probas = clf.predict_proba(features_per_user_seg[uid][TRAIN_SEG:])[:,1]
    preds = [1 if p in sorted(probas)[-top_n:] else 0 for p in probas]
    return get_metrics(y_test, preds)

In [115]:
def build_test_model(uid, features_per_user_seg,other_users=3, top_n=15):
    x_train, y_train = build_train_data(uid, features_per_user_seg,other_users)
    clf = MLPClassifier(random_state=42, learning_rate='adaptive', hidden_layer_sizes=(80,), max_iter=2000)
    clf.fit(x_train, y_train)
    probas = clf.predict_proba(features_per_user_seg[uid][TRAIN_SEG:])[:,1]
    preds = [1 if p in sorted(probas)[-top_n:] else 0 for p in probas]
    return preds

In [112]:
total_score = 0
for uid in range(LABELED_USERS):
    score = build_train_model(uid,features_per_user_seg, other_users=10, top_n=15)
    total_score += score
print(total_score)

Acc: 0.95
Precision: 0.6666666666666666
Recall: 1.0
F1 Score: 0.8
Classification Score: 175
Acc: 0.91
Precision: 0.5333333333333333
Recall: 0.8
F1 Score: 0.64
Classification Score: 155
Acc: 0.95
Precision: 0.6666666666666666
Recall: 1.0
F1 Score: 0.8
Classification Score: 175
Acc: 0.93
Precision: 0.6
Recall: 0.9
F1 Score: 0.7200000000000001
Classification Score: 165
Acc: 0.93
Precision: 0.6
Recall: 0.9
F1 Score: 0.7200000000000001
Classification Score: 165
Acc: 0.83
Precision: 0.26666666666666666
Recall: 0.4
F1 Score: 0.32
Classification Score: 115
Acc: 0.93
Precision: 0.6
Recall: 0.9
F1 Score: 0.7200000000000001
Classification Score: 165
Acc: 0.82
Precision: 0.25
Recall: 0.4
F1 Score: 0.3076923076923077
Classification Score: 114
Acc: 0.95
Precision: 0.6666666666666666
Recall: 1.0
F1 Score: 0.8
Classification Score: 175
Acc: 0.95
Precision: 0.6666666666666666
Recall: 1.0
F1 Score: 0.8
Classification Score: 175
1579


In [116]:
preds = []
for uid in range(LABELED_USERS, NUM_USERS):
    preds += [build_test_model(uid,features_per_user_seg, 10, top_n=15)]

In [118]:
np.array(preds).sum(axis=1)

array([15, 15, 15, 15, 15, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
       15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15])

In [119]:
import os
import shutil

version = input()

dir_path = f'submissions\\{version}'
notebook_name = f'318443595_318949443_{version}.ipynb'
if os.path.exists(dir_path):
    raise Exception(f'Directory for version {version} already exists')
os.system(f'mkdir {dir_path}')
shutil.copyfile('318443595_318949443.ipynb', f'{dir_path}\\{notebook_name}')
to_fill_df = pd.read_csv('challengeToFillOriginal.csv')
to_fill_df.iloc[LABELED_USERS: ,TRAIN_SEG+1:] = np.array(preds, dtype=int)
for col in to_fill_df.columns[1:]:
    to_fill_df[col] = to_fill_df[col].astype(int)
to_fill_df.to_csv(f'submissions/{version}/318443595_318949443_{version}.csv', index=False)

 2


FileNotFoundError: [Errno 2] No such file or directory: 'Untitled.ipynb'