In [2]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
from sklearn.impute import KNNImputer
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import os
import pandas as pd
import re

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

ModuleNotFoundError: No module named 'tensorflow'

In [1]:
from platform import python_version

# Report the interpreter version this kernel is running on.
kernel_python = python_version()
print(kernel_python)

3.7.13


In [None]:
# Locations of the raw CSV tables, relative to the notebook's working directory.
train_csv, labels_csv, test_csv = "./train.csv", "./train_labels.csv", "./test.csv"

X_train = pd.read_csv(train_csv)
Y_train = pd.read_csv(labels_csv)
test_full = pd.read_csv(test_csv)

In [None]:
# def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
#     data_dir = os.path.join("datasets", "kaggle_comp")
#     os.makedirs(data_dir, exist_ok=True)
#     path_format = os.path.join(data_dir, "my_{}_{:02d}.csv")

#     filepaths = []
#     m = len(data)
#     for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
#         part_csv = path_format.format(name_prefix, file_idx)
#         filepaths.append(part_csv)
#         with open(part_csv, "wt", encoding="utf-8") as f:
#             if header is not None:
#                 f.write(header)
#                 f.write("\n")
#             for row_idx in row_indices:
#                 f.write(",".join([repr(col) for col in data.loc[row_idx]]))
#                 f.write("\n")
#     return filepaths

In [None]:
# Function to reduce memory
def reduce_memory(df, use_float16=True):
    """Shrink a DataFrame's memory footprint by downcasting its columns in place.

    Column handling, by dtype:

    * object / pandas string columns -> ``category``
    * signed integers   -> smallest of int8/int16/int32/int64 holding [min, max]
    * unsigned integers -> smallest of uint8/uint16/uint32/uint64 holding [min, max]
    * floats            -> float16 (optional) or float32 when [min, max] fits
    * everything else (bool, datetime, timedelta, category, nullable extension
      dtypes, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compact. It is modified in place.
    use_float16 : bool, default True
        When True (the historical behaviour) float columns whose range fits in
        float16 are stored as float16. float16 keeps only about 3 significant
        decimal digits, so pass False to stop at float32 when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, returned for convenience.
    """
    signed_ladder = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ladder = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_ladder = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        dtype = df[col].dtype
        is_numpy_dtype = isinstance(dtype, np.dtype)

        # Text columns are stored as categoricals.
        if isinstance(dtype, pd.StringDtype) or (is_numpy_dtype and dtype.kind == 'O'):
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable Int64, tz-aware datetimes, ...)
        # cannot be safely cast to a plain NumPy dtype here, so skip them.
        if not is_numpy_dtype:
            continue

        # Dispatch on the NumPy kind code rather than on the dtype *name*:
        # a name-prefix test such as name[:3] == 'int' misses 'uint8' and
        # 'bool', which would then be treated (and cast) as floats.
        if dtype.kind == 'i':
            ladder, limits = signed_ladder, np.iinfo
        elif dtype.kind == 'u':
            ladder, limits = unsigned_ladder, np.iinfo
        elif dtype.kind == 'f':
            ladder, limits = float_ladder, np.finfo
        else:
            # bool, datetime64, timedelta64, complex, ...: nothing to do.
            continue

        col_min = df[col].min()
        col_max = df[col].max()

        # Walk from the narrowest candidate up and take the first that holds
        # the whole value range. Bounds are inclusive, so a column whose max
        # is exactly 127 still fits int8. If min/max are NaN (empty or all-NaN
        # column) or infinite, no candidate matches and the column is kept.
        for candidate in ladder:
            bounds = limits(candidate)
            if bounds.min <= col_min and col_max <= bounds.max:
                # Only ever narrow a column, never widen it.
                if np.dtype(candidate).itemsize < dtype.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    return df

In [None]:
# reduce_memory assigns the converted columns back onto the frame it receives
# and returns that same object, so train_df and X_train are one DataFrame.
train_df = reduce_memory(X_train)
# Print the resulting column dtypes and memory usage.
train_df.info()

In [None]:
# (n_rows, n_columns) of train_df.
train_df.shape

In [None]:
# Preview the first rows of train_df.
train_df.head()

In [None]:
# As with train_df, reduce_memory works in place and returns its argument,
# so labels_df and Y_train are the same DataFrame from here on.
labels_df = reduce_memory(Y_train)
# Print the resulting column dtypes and memory usage.
labels_df.info()

In [None]:
# The label's session_id is an underscore-separated string; keep the part
# before the first underscore as user_id (still a string at this point).
session_id_parts = labels_df["session_id"].str.split("_")
labels_df["user_id"] = session_id_parts.str[0]

In [None]:
# Check the new user_id column.
labels_df.head()

In [None]:
# Second underscore-separated piece of session_id, reduced to its digits.
# NOTE(review): this number is stored as "level" and later joined against
# train_df's "level" column - confirm both really mean the same thing.
suffix_part = labels_df["session_id"].str.split("_", expand=True)[1]
# Raw string for the pattern: "\D" in a normal string literal is an invalid
# escape sequence (DeprecationWarning today, an error in future Pythons).
labels_df["level"] = pd.to_numeric(suffix_part.str.replace(r"\D", "", regex=True))
labels_df["user_id"] = pd.to_numeric(labels_df["user_id"])
# Group code: 0 for values <= 4, 1 for 5..12, 2 for anything above 12.
# Vectorised equivalent of the nested conditional, computed without a
# Python-level apply over every row.
labels_df["session_level"] = (
    (labels_df["level"] > 4).astype("int64")
    + (labels_df["level"] > 12).astype("int64")
)

In [None]:
# Check the new level / session_level columns.
labels_df.head()

In [None]:
# Questions 1-4 belong to level 1, 5-12 to level 2, 13 - 22 to level 3
# NOTE(review): the session_level column built above encodes these three
# groups as 0, 1 and 2 (not 1, 2, 3) - keep the numbering in mind.
# List the distinct values found in the "level" column.
labels_df.level.unique()

In [None]:
# Count distinct ids (unique() keeps NaN as a value, if any are present).
n_unique_users = len(labels_df["user_id"].unique())
n_unique_sessions = len(labels_df["session_id"].unique())

print("Number of unique users: ", n_unique_users)
print("Number of unique sessions: ", n_unique_sessions)

In [None]:
# Missing-value count for every column of train_df.
train_df.isna().sum()

In [None]:
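## Skipped for now: the CSV parts written by save_to_multiple_csv_files come back with an extra column in some rows.
## (Possibly because that helper joins repr() of each field with commas and applies no CSV quoting, so text fields containing commas split - TODO confirm.)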
# header_cols = X_train.columns
# header = ",".join(header_cols)
# train_filepaths = save_to_multiple_csv_files(X_train, "train", header, n_parts=100)

In [None]:
# Columns of train_df to carry into the numeric feature table.
numeric_feature_names = [
    'session_id', 'index', 'elapsed_time', 'level', 'page',
    'room_coor_x', 'room_coor_y',
    'screen_coor_x', 'screen_coor_y',
    'hover_duration', 'fullscreen', 'hq', 'music',
]

# Take an independent copy so later edits do not write through to train_df.
numeric_features = train_df.loc[:, numeric_feature_names].copy()
numeric_features.head()

In [None]:
# Based on the data described in the notebook, the gaps are of the MNAR kind
# (missing not at random). Count them per column.
numeric_features.isna().sum()

In [None]:
# Shapes of the feature table and of the labels table, side by side.
(numeric_features.shape, labels_df.shape)

In [None]:
# Distinct session ids in the features vs. distinct user ids in the labels.
# Read user_id from labels_df, the frame it was added to: Y_train only has
# that column because reduce_memory returned the very same object, and this
# cell should not depend on that aliasing.
len(numeric_features['session_id'].unique()), len(labels_df['user_id'].unique())

In [None]:
# Summary statistics of the hover_duration column.
numeric_features['hover_duration'].describe()

In [None]:
# # Generating new data based on the users average time per level
# def time_per_level(data,users_id):
#     res = pd.DataFrame([])
#     filtered_features =[]
#     for user in users_id:
#         for level in range(23):
#             filtered_features = data[(data['session_id'] == user ) & (data['level'] == level)]
#             avg_time = filtered_features['elapsed_time'].mean()
#             res = pd.concat([res, pd.DataFrame({'time_per_level': [avg_time],'user_id':user,'level':level})], ignore_index=True)
#     return res
# val = time_per_level(X_train,Y_train.user_id.unique())

In [None]:
# Build a renamed copy of the labels and leave labels_df itself untouched:
# the original underscore-separated session_id string moves to 'session_res',
# and the numeric user_id takes over the 'session_id' name.
labels_df_cp = labels_df.rename(
    columns={'session_id': 'session_res', 'user_id': 'session_id'}
)
labels_df_cp.head()

In [None]:
# There is no level 0 in the training labels provided, how should we handle this?
# With an inner join, rows of train_df whose (session_id, level) pair has no
# matching label row are dropped from the result.
train_df_cp = train_df.copy()
df_full = train_df_cp.merge(
    labels_df_cp,
    on=['session_id', 'level'],
    how='inner',
)

In [None]:
# Preview the merged features + labels table.
df_full.head()

In [None]:
# (n_rows, n_columns) of the merged table.
df_full.shape

In [None]:
# def get_group_level(q):
#     qno = int(q[1:])
#     if qno < 4:
#         return '0-4'
#     elif qno < 14:
#         return '5-12'
#     return '13-22'

In [None]:
# train_label['q'] = train_label['session_id'].apply(lambda s: s.split("_")[-1])
# train_label['level_group'] = train_label.q.apply(get_group_level)

In [None]:
# Columns to rescale to [0, 1], and the names the scaled versions are stored under.
columns_to_scale = ['elapsed_time', 'fullscreen', 'room_coor_x', 'room_coor_y',
                    'screen_coor_x', 'screen_coor_y', 'hover_duration']
scaled_column_names = [name + '_scaled' for name in columns_to_scale]

scaler = MinMaxScaler()
# MinMaxScaler disregards NaN when fitting and keeps them as NaN in the output.
scaled_values = scaler.fit_transform(df_full[columns_to_scale])

# Wrap the *scaled array* in a frame. Building the frame from df_full with the
# new column names (as before) only selects non-existent columns and yields
# all-NaN data. Reusing df_full's index keeps rows aligned in the concat below.
training_data_scaled = pd.DataFrame(scaled_values,
                                    columns=scaled_column_names,
                                    index=df_full.index)

# Drop scaled columns left over from a previous run of this cell so re-running
# it does not append duplicate column names.
df_full = pd.concat(
    [df_full.drop(columns=scaled_column_names, errors='ignore'), training_data_scaled],
    axis=1,
)

In [None]:
# Shape after appending the *_scaled columns, then a preview of the rows.
print('Dataset shape: ',df_full.shape,'\n')
df_full.head()

In [None]:
# test_data=df_full[['elapsed_time_scaled','fullscreen_scaled']].copy()
# label_data=df_full[['correct']].copy()
# training_data.info(),label_data.info()

In [None]:
# test.info()

In [None]:
# scaler.fit(test[['elapsed_time', 'fullscreen']])
# test_data_scaled = scaler.transform(test[['elapsed_time', 'fullscreen']])
# test_data_scaled = pd.DataFrame(test, columns=['elapsed_time_scaled', 'fullscreen_scaled'])
# test = pd.concat([test, test_data_scaled], axis=1)
# test_data=test[['elapsed_time_scaled','fullscreen_scaled']].copy()
# test_data.info()

In [None]:
# Model inputs: two scaled feature columns. Target: the 'correct' column,
# kept two-dimensional (one-column frame) by the double brackets.
feature_columns = ['elapsed_time_scaled', 'fullscreen_scaled']
target_columns = ['correct']

training_data = df_full[feature_columns]
label_data = df_full[target_columns]
print('Training data shape: ', training_data.shape, '\n', 'Label data shape: ', label_data.shape)

In [None]:
# Ordered (unshuffled) 80/20 split: the first 80% of rows train the model,
# the remaining rows are held out for validation.
feature_cut = int(len(training_data) * .8)
label_cut = int(len(label_data) * .8)

x_train = training_data.iloc[:feature_cut]
x_val = training_data.iloc[feature_cut:]
y_train = label_data.iloc[:label_cut]
y_val = label_data.iloc[label_cut:]
print('X train shape: ', x_train.shape, '\n', 'X valid shape: ', x_val.shape)

In [None]:
# Small feed-forward binary classifier: 2 inputs -> 10 ReLU units -> 1 sigmoid output.
model = keras.models.Sequential()
model.add(keras.layers.Dense(10, input_shape=(2,), activation="relu"))
model.add(keras.layers.Dense(1, activation="sigmoid"))

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked for reporting.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for 5 epochs, evaluating on the held-out split after each one.
validation_split = (x_val, y_val)
history = model.fit(
    x_train,
    y_train,
    validation_data=validation_split,
    epochs=5,
)

In [None]:
# test.loc[2,['elapsed_time','fullscreen']].values

In [None]:
# X_new = test.loc[2,['elapsed_time','fullscreen']].values
# X_new = X_new.astype('float32')
# y_proba = model.predict(X_new.reshape(1, 2))

In [None]:
# y_proba

In [1]:
# jo_wilder is an external module that is not defined in this notebook; the
# recorded output below shows the import failing with ModuleNotFoundError
# where it was last run.
import jo_wilder
# Create the environment object and get the test iterator from it.
# NOTE(review): presumably iter_test yields (test, sample_submission) pairs, as
# the commented-out loop further down unpacks them - confirm against the module.
env = jo_wilder.make_env()
iter_test = env.iter_test()

ModuleNotFoundError: No module named 'jo_wilder.competition'

In [None]:
# limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

# for (test, sample_submission) in iter_test:
    
#     dummies = pd.get_dummies(test['event_name'])
#     test = pd.concat([test, dummies], axis=1)
#     df = feature_engineer(test)
#     grp = test.level_group.values[0]
#     a,b = limits[grp]
#     for t in range(a,b):
#         clf = models[f'{grp}_{t}']
#         p = clf.predict_proba(df[FEATURES].astype('float32'))[:,1]
#         pint = [int(x>best_threshold) for x in p ]
#         mask = sample_submission.session_id.str.endswith(f'q{t}')
#         sample_submission.loc[mask,'correct'] = pint
    
#     env.predict(sample_submission)

# print("Your submission was successfully saved!")