In [1]:
import sys
import sklearn
import numpy as np
import pandas as pd
from time import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model

# Show the interpreter version so the run environment is recorded with the notebook.
"Python: {}".format(sys.version)

'Python: 3.7.3 (default, Mar 27 2019, 22:11:17) \n[GCC 7.3.0]'

### Setup

In [2]:
# Load both splits, indexed by recommendation set id.
trainData = pd.read_csv(
    "tcdml1920-rec-click-pred--training.csv",
    index_col='recommendation_set_id',
    low_memory=False,
)

# Rows of the test file in which every column is missing are dropped right away.
testData = pd.read_csv(
    "tcdml1920-rec-click-pred--test.csv",
    index_col='recommendation_set_id',
    low_memory=False,
).dropna(how="all")

In [4]:
# Placeholder strings that the raw files use for "no value"; map them all to NaN
# so that pandas' missing-data tooling sees them.
missing_tokens = ["\\N", "nA", "Not provided", "unknown", "*unknown*"]
for frame in (trainData, testData):
    frame.replace(missing_tokens, np.nan, inplace=True)

In [5]:
# Missing-value count per column, smallest first.
trainData.isna().sum().sort_values()

set_clicked                              0
ctr                                      0
rec_processing_time                      0
response_delivered                       0
hour_request_received                    0
request_received                         0
application_type                         0
organization_id                          0
number_of_recs_in_set                    0
clicks                                   0
search_keywords                          0
search_abstract                          0
search_title                             0
query_identifier                       154
query_char_count                       154
query_word_count                       154
country_by_ip                         1584
query_detected_language               3597
recommendation_algorithm_id_used     10677
algorithm_class                      13026
app_lang                             16697
item_type                            36223
timezone_by_ip                       76209
local_hour_

In [None]:
# Columns that are converted with pd.to_numeric further down, grouped by what
# their names refer to.
numeric_columns = [
    # query
    "query_word_count", "query_char_count", "query_document_id",
    # document / author
    "year_published", "number_of_authors",
    "abstract_char_count", "abstract_word_count",
    "first_author_id", "num_pubs_by_first_author",
    # request hour
    "hour_request_received", "local_hour_of_request",
    # recommendation / clicks
    "recommendation_algorithm_id_used", "clicks",
]

In [None]:
# Columns that are parsed with pd.to_datetime further down.
# NOTE: "time_recs_recieved" is spelled this way on purpose; it is used verbatim
# as a column key, so do not "correct" it here.
time_columns = [
    # request / response
    "request_received", "response_delivered", "local_time_of_request",
    # recommendation events
    "time_recs_recieved", "time_recs_displayed", "time_recs_viewed",
]

In [None]:
# Convert every listed column to a numeric dtype in both splits.
# errors="coerce" turns any value that cannot be parsed into NaN instead of raising.
trainData[numeric_columns] = trainData[numeric_columns].apply(pd.to_numeric, errors="coerce")
testData[numeric_columns] = testData[numeric_columns].apply(pd.to_numeric, errors="coerce")

In [None]:
# Parse every listed column as a day/month/year hour:minute timestamp in both splits.
# No errors= argument is given, so a value that does not match the format raises.
trainData[time_columns] = trainData[time_columns].apply(pd.to_datetime, format="%d/%m/%Y %H:%M")
testData[time_columns] = testData[time_columns].apply(pd.to_datetime, format="%d/%m/%Y %H:%M")

In [None]:
# Display limits: show up to 500 columns, but truncate long outputs to 5 rows.
# These settings stay in effect for every later cell.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 5

# How often each query document id occurs in the training data.
trainData["query_document_id"].value_counts()

In [None]:
# List the column labels of the training frame.
trainData.columns

In [None]:
# Show the dtype of each column, to check the numeric / datetime conversions above.
trainData.dtypes

In [None]:
# Share of each set_clicked value in the training data.
trainData['set_clicked'].value_counts().plot.pie(figsize=(8, 8))

In [None]:
# Colour-coded correlation table.
# The frame also holds text and datetime columns. Older pandas dropped those
# silently inside corr(); pandas 2.x raises instead. Selecting the numeric
# (and boolean) columns explicitly gives the same table on every version.
trainData.select_dtypes(include=["number", "bool"]).corr().style.background_gradient(cmap='coolwarm', axis=None)

In [None]:
# Correlation heatmap of the numeric columns.
# The correlation matrix only contains the numeric (and boolean) columns, so
# tick positions and labels must be taken from the matrix itself. Using
# trainData.columns / trainData.shape[1] would attach the wrong names to the
# rows and columns, because trainData also holds text and datetime columns.
corr_matrix = trainData.select_dtypes(include=["number", "bool"]).corr()
tick_positions = range(len(corr_matrix.columns))

f = plt.figure(figsize=(19, 15))
plt.matshow(corr_matrix, fignum=f.number)
# matshow draws the x labels on top; left-aligning keeps rotated labels over their column.
plt.xticks(tick_positions, corr_matrix.columns, fontsize=14, rotation=45, ha="left")
plt.yticks(tick_positions, corr_matrix.columns, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);

In [None]:
# Duplicate check on the training data: duplicated() flags every row whose
# column values repeat an earlier row (the first occurrence is not flagged).
duplicateRowsTrain = trainData.loc[trainData.duplicated()]
duplicateRowsTrain.shape

The training data contains 285 duplicated rows.

In [None]:
# Duplicate check on the test data: duplicated() flags every row whose
# column values repeat an earlier row (the first occurrence is not flagged).
duplicateRowsTest = testData.loc[testData.duplicated()]
duplicateRowsTest.shape

### Basic visualizations

In [None]:
def plot_category_counts(data, column, top_n=None, sort_by_frequency=False, wide=True):
    """Draw a bar chart of how often each value of ``column`` occurs in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the categorical column to count.
    top_n : int, optional
        If given, only the ``top_n`` most frequent values are drawn, most
        frequent first.
    sort_by_frequency : bool, default False
        If True, bars are ordered from most to least frequent even when
        ``top_n`` is not given. Otherwise seaborn's default ordering is used.
    wide : bool, default True
        Use a 24x4 inch figure; if False, matplotlib's default figure size.
    """
    fig, ax = plt.subplots(figsize=(24, 4) if wide else None)

    order = None
    if top_n is not None or sort_by_frequency:
        # value_counts() is sorted by descending frequency; iloc[:None] keeps everything.
        order = data[column].value_counts().iloc[:top_n].index

    sns.countplot(x=column, data=data, order=order, ax=ax)
    ax.set_title(column)
    plt.show()


# (column, plot options) for every chart, in display order.
# Columns that were already disabled before this rewrite and are still not
# plotted: query_identifier, user_os_version, user_java_version.
count_plot_specs = [
    ("document_language_provided", {}),
    ("abstract_detected_language", {}),
    ("application_type", {"wide": False}),
    ("item_type", {}),
    ("app_version", {"top_n": 10}),
    ("app_lang", {"wide": False}),
    ("user_os", {"wide": False}),
    ("user_timezone", {"sort_by_frequency": True}),
    ("country_by_ip", {"top_n": 15}),
    ("timezone_by_ip", {"top_n": 15}),
    ("algorithm_class", {}),
]

# The plots read from trainData, the training frame loaded at the top of the
# notebook (the previous name, train_set, was never defined).
for column, options in count_plot_specs:
    plot_category_counts(trainData, column, **options)

### Encoding
- One hot
- Target

- SMOTE (oversampling the minority class)
- Giving more weight to the "yes" class