# Untitled

In [54]:
%run utils.ipynb

In [55]:
import pandas as pd
import numpy as np
import os
import sys
# from utils import *
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
# import dill

In [56]:
import sklearn
from sklearn.svm import OneClassSVM
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, top_k_accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, FunctionTransformer, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, make_pipeline

In [57]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Dropout, Activation, Input, Embedding, TextVectorization, Reshape, Add, Concatenate, Flatten, Conv1D, Conv1DTranspose
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

In [58]:
_, seg_df = load_user_data()
print(seg_df.shape)
y = pd.read_csv('challengeToFill.csv').drop(columns=['Unnamed: 0']).T.reset_index(drop=True).T
y_train = y.iloc[:LABELED_USERS, :TRAIN_SEG]
y_test_df = y.iloc[:LABELED_USERS, TRAIN_SEG:]
train_df = seg_df.iloc[:LABELED_USERS, :TRAIN_SEG]
test_df = seg_df.iloc[:LABELED_USERS, TRAIN_SEG:]
submission_df = seg_df.iloc[LABELED_USERS:, TRAIN_SEG:]
sentences = train_df.to_numpy().flatten().tolist()
# embedding_model = create_embeddings(sentences, vector_size=EMBEDDING_DIM, window=5)
# print(embedding_model.wv)
print(train_df.shape)
print(test_df.shape)

(40, 150)
(10, 50)
(10, 100)


In [138]:
all_commands = np.unique(seg_df.to_numpy().flatten().tolist()).tolist()

In [139]:
train = train_df.to_numpy().flatten()
test = test_df.to_numpy().flatten()
train.shape, test.shape

((500,), (1000,))

In [61]:
w2x = {k: idx for idx, k in enumerate(all_commands)}
x2w = {idx: k for idx, k in enumerate(all_commands)}

In [141]:
def seg_to_hist(segs, vocab=all_commands):
    count_vec = CountVectorizer(lowercase=False, vocabulary=vocab)
    features = count_vec.transform([' '.join(seg) for seg in segs])
    return features.toarray()

In [143]:
def tfidf_per_user(uid, top_n=100):
    documents = []
    for i in range(NUM_USERS):
        documents.append(' '.join([' '.join(x) for x in seg_df.iloc[i,:TRAIN_SEG]]))
    tfidf = TfidfVectorizer(ngram_range=(3,3))
    data = tfidf.fit_transform(documents)
    tfidf_features = pd.DataFrame.sparse.from_spmatrix(data, columns=tfidf.get_feature_names_out()).sparse.to_dense().T
    tfidf_features.columns = [f'user{i}' for i in range(NUM_USERS)]
    top_n_user = tfidf_features.nlargest(top_n, f'user{uid}').index.tolist()
    return top_n_user

In [254]:
def get_all_features(top_n=100):
    all_features = []
    for uid in range(NUM_USERS):
        gram1_hist = seg_to_hist(seg_df.iloc[uid, :])
        top_n_user = tfidf_per_user(uid, top_n=top_n)
        ngram_hist = seg_to_hist(seg_df.iloc[uid, :], vocab=top_n_user)
        # print(gram1_hist.shape)
        most_common = np.expand_dims(np.array([w2x[stats.mode(x).mode[0]] for x in seg_df.iloc[uid, :]]), axis=-1)
        most_common_count = np.expand_dims(np.array([stats.mode(x).count[0] for x in seg_df.iloc[uid, :]]), axis=-1)
        num_unique = np.expand_dims(np.array([len(set(x)) for x in seg_df.iloc[uid, :]]), axis=-1)
        # ngram_hist = ngram_to_histogram(seg_df.iloc[uid, :])
        all_features.append(np.concatenate([gram1_hist, ngram_hist, most_common, num_unique], axis=1))
    return all_features

In [263]:
features_per_user_seg = get_all_features(10)
features_per_user_seg[1][0].shape

In [265]:
def build_train_data(uid, features_per_user_seg, other_users):
    x_train = features_per_user_seg[uid][:TRAIN_SEG]
    for i in range(NUM_USERS):
        if i != uid:
            x_train = np.concatenate([x_train, features_per_user_seg[i][:other_users]])
    y_train = np.concatenate([np.zeros(TRAIN_SEG), np.ones(other_users*(NUM_USERS-1))])
    return x_train, y_train

In [308]:
def build_train_model(uid, features_per_user_seg,other_users=3, top_n=15):
    x_train, y_train = build_train_data(uid, features_per_user_seg,other_users)
    clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000)
    clf.fit(x_train, y_train)
    y_test = y_test_df.iloc[uid].to_numpy()
    probas = clf.predict_proba(features_per_user_seg[uid][TRAIN_SEG:])[:,1]
    preds = [1 if p in sorted(probas)[-top_n:] else 0 for p in probas]
    return get_metrics(y_test, preds)

In [309]:
def build_test_model(uid, features_per_user_seg,other_users=3, top_n=15):
    x_train, y_train = build_train_data(uid, features_per_user_seg,other_users)
    clf = MLPClassifier(learning_rate='adaptive', hidden_layer_sizes=(100,), max_iter=1000)
    clf.fit(x_train, y_train)
    probas = clf.predict_proba(features_per_user_seg[uid][TRAIN_SEG:])[:,1]
    preds = [1 if p in sorted(probas)[-top_n:] else 0 for p in probas]
    return preds

In [313]:
total_score = 0
for uid in range(LABELED_USERS):
    score = build_train_model(uid,features_per_user_seg, other_users=2, top_n=20)
    total_score += score
print(total_score)

Acc: 0.88
Precision: 0.45
Recall: 0.9
F1 Score: 0.6
Classification Score: 160
Final Grade: 9.967213114754099
Acc: 0.88
Precision: 0.45
Recall: 0.9
F1 Score: 0.6
Classification Score: 160
Final Grade: 9.967213114754099
Acc: 0.9
Precision: 0.5
Recall: 1.0
F1 Score: 0.6666666666666666
Classification Score: 170
Final Grade: 10.59016393442623
Acc: 0.88
Precision: 0.45
Recall: 0.9
F1 Score: 0.6
Classification Score: 160
Final Grade: 9.967213114754099
Acc: 0.86
Precision: 0.4
Recall: 0.8
F1 Score: 0.5333333333333333
Classification Score: 150
Final Grade: 9.344262295081966
Acc: 0.82
Precision: 0.3
Recall: 0.6
F1 Score: 0.4
Classification Score: 130
Final Grade: 8.098360655737705
Acc: 0.87
Precision: 0.43478260869565216
Recall: 1.0
F1 Score: 0.6060606060606061
Classification Score: 167
Final Grade: 10.40327868852459
Acc: 0.79
Precision: 0.23809523809523808
Recall: 0.5
F1 Score: 0.3225806451612903
Classification Score: 119
Final Grade: 7.413114754098361
Acc: 0.83
Precision: 0.37037037037037035
R

In [314]:
preds = []
for uid in range(LABELED_USERS, NUM_USERS):
    preds += [build_test_model(uid,features_per_user_seg, 3)]

In [315]:
np.array(preds).sum(axis=1)

array([15, 15, 15, 15, 15, 16, 15, 15, 17, 15, 15, 15, 15, 15, 15, 15, 15,
       15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15])

In [None]:
import os
import shutil

version = input()

dir_path = f'submissions\\{version}'
notebook_name = f'318443595_318949443_{version}.ipynb'
if not os.path.exists(dir_path):
    os.system(f'mkdir {dir_path}')

shutil.copyfile('Untitled.ipynb', f'{dir_path}\\{notebook_name}')
to_fill_df = pd.read_csv('challengeToFillOriginal.csv')
to_fill_df.iloc[LABELED_USERS: ,TRAIN_SEG+1:] = np.array(preds, dtype=int)
for col in to_fill_df.columns[1:]:
    to_fill_df[col] = to_fill_df[col].astype(int)
to_fill_df.to_csv(f'submissions/{version}/318443595_318949443_{version}.csv', index=False)