In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import math
import os
import sklearn
import numpy as np
import pandas as pd
from time import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
%config IPcompletor.greedy = True
# print("Python: {}".format(sys.version))
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression


##### Change your path according to your dataset's directory

In [0]:
import csv
path='drive/My Drive/ml_colab/'
train_path=path+'rec_pred_train.csv'
test_path=path+'rec_pred_test.csv'

trainData = pd.read_csv(train_path, index_col = 0,error_bad_lines=True)#385688
testData = pd.read_csv(test_path, index_col = 0,error_bad_lines=False)#9146
testData.dropna(how="all", inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


# Data Cleaning

In [0]:
trainData.replace(["\\N", "nA", "Not provided", "NaN"], np.nan, inplace=True)
testData.replace(["\\N", "nA", "Not provided", "NaN"], np.nan, inplace=True)

In [0]:
numeric_columns = [
    "query_word_count",
    "query_char_count",
    "query_document_id",
    "year_published",
    "number_of_authors",
    "abstract_char_count",
    "abstract_word_count",
    "first_author_id",
    "num_pubs_by_first_author",
    "hour_request_received",
    "local_hour_of_request",
    "recommendation_algorithm_id_used",
    "clicks",
]
time_columns = [
    "request_received",
    "response_delivered",
    "local_time_of_request",
    "time_recs_recieved",
    "time_recs_displayed",
    "time_recs_viewed",
]
for col in numeric_columns:
    trainData[col] = pd.to_numeric(trainData[col], errors="coerce")
    testData[col] = pd.to_numeric(testData[col], errors="coerce")
for col in time_columns:
    trainData[col] = pd.to_datetime(trainData[col], format="%d/%m/%Y %H:%M")
    testData[col] = pd.to_datetime(testData[col], format="%d/%m/%Y %H:%M")

In [0]:
trainData.isnull().sum().sort_values()

set_clicked                              0
ctr                                      0
rec_processing_time                      0
response_delivered                       0
hour_request_received                    0
request_received                         0
application_type                         0
organization_id                          0
algorithm_class                          0
number_of_recs_in_set                    0
search_title                             0
search_keywords                          0
search_abstract                          0
clicks                                   0
query_char_count                       154
query_word_count                       154
query_identifier                       154
country_by_ip                         1584
query_detected_language               3597
recommendation_algorithm_id_used     10677
app_lang                             16697
item_type                            36223
timezone_by_ip                       76209
local_hour_

In [0]:
# Splitting the dataset by organisation_id (1,4,8 ~ Blog, Jabref, MyVolts **in no specific order)

o_id1_train = trainData[trainData['organization_id'] == 1]
o_id4_train = trainData[trainData['organization_id'] == 4]
o_id8_train = trainData[trainData['organization_id'] == 8]

o_id1_test = testData[testData['organization_id'] == 1]
o_id4_test = testData[testData['organization_id'] == 4]
o_id8_test = testData[testData['organization_id'] == 8]

del o_id1_train['organization_id']
del o_id4_train['organization_id']
del o_id8_train['organization_id']

del o_id1_test['organization_id']
del o_id4_test['organization_id']
del o_id8_test['organization_id']

In [0]:
# Checking if the lengths match

print(len(trainData))
print(len(o_id1_train))
print(len(o_id4_train))
print(len(o_id8_train))
print('\n')
print(len(testData))
print(len(o_id1_test))
print(len(o_id4_test))
print(len(o_id8_test))

385687
270246
100215
15226


9145
5176
3029
940


In [0]:
### Printing the nan values by column for train data, classified by organisation_id

print('\n Organization Id 1')
print(o_id1_train.isnull().sum().sort_values())
print(o_id1_train.isnull().values.any()) #Prints True if atleast 1 Nan present
print(o_id1_train.isnull().sum().sum()) #Prints total no of nan values
print(len(o_id1_train)) #Prints total no of rows (length) in DataFrame

print('\n Organization Id 4')
print(o_id4_train.isnull().sum().sort_values())
print(o_id4_train.isnull().values.any())
print(o_id4_train.isnull().sum().sum())
print(len(o_id4_train))

print('\n Organization Id 4')
print(o_id8_train.isnull().sum().sort_values())
print(o_id8_train.isnull().values.any())
print(o_id8_train.isnull().sum().sum())
print(len(o_id8_train))
print('\n')


 Organization Id 1
set_clicked                              0
clicks                                   0
query_identifier                         0
query_word_count                         0
query_char_count                         0
search_abstract                          0
search_keywords                          0
search_title                             0
algorithm_class                          0
number_of_recs_in_set                    0
ctr                                      0
rec_processing_time                      0
response_delivered                       0
hour_request_received                    0
request_received                         0
application_type                         0
app_version                             84
item_type                              101
app_lang                              1154
country_by_ip                         1280
query_detected_language               3443
recommendation_algorithm_id_used     10677
timezone_by_ip                    

In [0]:
### Printing the nan values by column for test data, classified by organisation_id
print('\n Organization Id 1')
print(o_id1_test.isnull().sum().sort_values())
print(o_id1_test.isnull().values.any())
print(o_id1_test.isnull().sum().sum())
print(len(o_id1_test))
print('\n Organization Id 4')
print(o_id4_test.isnull().sum().sort_values())
print(o_id4_test.isnull().values.any())
print(o_id4_test.isnull().sum().sum())
print(len(o_id4_test))
print('\n Organization Id 8')
print(o_id8_test.isnull().sum().sort_values())
print(o_id8_test.isnull().values.any())
print(o_id8_test.isnull().sum().sum())
print(len(o_id8_test))
print('\n')


 Organization Id 1
request_received                       0
search_title                           0
query_identifier                       0
query_word_count                       0
query_char_count                       0
search_keywords                        0
search_abstract                        0
hour_request_received                  0
algorithm_class                        0
application_type                       0
item_type                              2
app_version                            2
app_lang                              19
country_by_ip                         31
query_detected_language               51
recommendation_algorithm_id_used     229
timezone_by_ip                       329
local_hour_of_request                509
local_time_of_request                509
cbf_parser                          1379
query_document_id                   3645
year_published                      3737
number_of_authors                   3766
first_author_id                     3

In [0]:
### Printing the nan values by column for train data with set clicked 1, classified by organisation_id

set_1_train_od1=o_id1_train[o_id1_train['set_clicked'] == 1]
set_1_train_od4=o_id4_train[o_id4_train['set_clicked'] == 1]
set_1_train_od8=o_id8_train[o_id8_train['set_clicked'] == 1]

print('\n Organization Id 1 for set_clicked=1')
print(set_1_train_od1.isnull().sum().sort_values())
print(set_1_train_od1.isnull().values.any())
print(set_1_train_od1.isnull().sum().sum())
print(len(set_1_train_od1))
print('\n Organization Id 4 for set_clicked=1')

print(set_1_train_od4.isnull().sum().sort_values())
print(set_1_train_od4.isnull().values.any())
print(set_1_train_od4.isnull().sum().sum())
print(len(set_1_train_od4))
print('\n Organization Id 8 for set_clicked=1')

print(set_1_train_od8.isnull().sum().sort_values())
print(set_1_train_od8.isnull().values.any())
print(set_1_train_od8.isnull().sum().sum())
print(len(set_1_train_od8))
print('\n')


 Organization Id 1 for set_clicked=1
set_clicked                            0
clicks                                 0
query_identifier                       0
query_word_count                       0
query_char_count                       0
search_abstract                        0
search_keywords                        0
search_title                           0
algorithm_class                        0
number_of_recs_in_set                  0
ctr                                    0
rec_processing_time                    0
response_delivered                     0
hour_request_received                  0
request_received                       0
application_type                       0
app_version                            8
item_type                              8
query_detected_language               30
app_lang                              38
country_by_ip                         42
recommendation_algorithm_id_used     278
timezone_by_ip                       331
local_hour_of_reque

In [0]:
# Dropping columns that are completely and almost completely nan for set clicked 1 for each organisation
o_id1_train=o_id1_train.drop(["user_java_version", "user_os_version", "session_id","user_id","time_recs_viewed","time_recs_displayed","time_recs_recieved","user_timezone","user_os","document_language_provided","abstract_detected_language","abstract_char_count","abstract_word_count"], axis=1)
o_id1_test=o_id1_test.drop(["user_java_version", "user_os_version", "session_id","user_id","time_recs_viewed","time_recs_displayed","time_recs_recieved","user_timezone","user_os","document_language_provided","abstract_detected_language","abstract_char_count","abstract_word_count"], axis=1)

o_id4_train=o_id4_train.drop(["app_version","number_of_authors","user_timezone","user_os","num_pubs_by_first_author","first_author_id","user_os_version","user_java_version","year_published","document_language_provided","time_recs_viewed"],axis=1)
o_id4_test=o_id4_test.drop(["app_version","number_of_authors","user_timezone","user_os","num_pubs_by_first_author","first_author_id","user_os_version","user_java_version","year_published","document_language_provided","time_recs_viewed"],axis=1)

o_id8_train=o_id8_train.drop(["app_version","user_timezone","session_id","document_language_provided","year_published","number_of_authors","first_author_id","num_pubs_by_first_author","app_lang","user_os","user_os_version","user_java_version","user_id","time_recs_viewed"],axis=1)
o_id8_test=o_id8_test.drop(["app_version","user_timezone","session_id","document_language_provided","year_published","number_of_authors","first_author_id","num_pubs_by_first_author","app_lang","user_os","user_os_version","user_java_version","user_id","time_recs_viewed"],axis=1)

In [0]:
### Printing the nan values by column for train data, classified by organisation_id

print('\n Organization Id 1')
print(o_id1_train.isnull().sum().sort_values())
print(o_id1_train.isnull().values.any())
print(o_id1_train.isnull().sum().sum())
print(len(o_id1_train))
print('\n Organization Id 4')
print(o_id4_train.isnull().sum().sort_values())
print(o_id4_train.isnull().values.any())
print(o_id4_train.isnull().sum().sum())
print(len(o_id4_train))
print('\n Organization Id 8')
print(o_id8_train.isnull().sum().sort_values())
print(o_id8_train.isnull().values.any())
print(o_id8_train.isnull().sum().sum())
print(len(o_id8_train))
print('\n')


 Organization Id 1
query_identifier                         0
clicks                                   0
search_abstract                          0
search_keywords                          0
search_title                             0
algorithm_class                          0
number_of_recs_in_set                    0
ctr                                      0
rec_processing_time                      0
response_delivered                       0
hour_request_received                    0
set_clicked                              0
application_type                         0
query_word_count                         0
query_char_count                         0
request_received                         0
app_version                             84
item_type                              101
app_lang                              1154
country_by_ip                         1280
query_detected_language               3443
recommendation_algorithm_id_used     10677
timezone_by_ip                    

In [0]:
# NOT THE BEST WAY TO FILL THE REMAINING NANs..... HAVE TO FIND ALTERNTATIVES FOR BETTER PERFORMANCE

# o_id1_train[o_id1_train.columns]=o_id1_train[o_id1_train.columns].fillna(o_id1_train.mode(dropna=True).iloc[0])
# o_id4_train[o_id4_train.columns]=o_id4_train[o_id4_train.columns].fillna(o_id4_train.mode(dropna=True).iloc[0])
# o_id8_train[o_id8_train.columns]=o_id8_train[o_id8_train.columns].fillna(o_id8_train.mode(dropna=True).iloc[0])

# HAVE TO TRY THIS.
# ALSO HAVE TO TRY FILLING 'YES_NANS' WITH MODE OF 'YES'.
o_id1_train.dropna(how='any',inplace=True)
o_id4_train.dropna(how='any',inplace=True)
o_id8_train.dropna(how='any',inplace=True)


o_id1_test[o_id1_test.columns]=o_id1_test[o_id1_test.columns].fillna(o_id1_train.mode(dropna=True).iloc[0])
o_id4_test[o_id4_test.columns]=o_id4_test[o_id4_test.columns].fillna(o_id4_train.mode(dropna=True).iloc[0])
o_id8_test[o_id8_test.columns]=o_id8_test[o_id8_test.columns].fillna(o_id8_train.mode(dropna=True).iloc[0])

In [0]:
### Printing the nan values by column for train data, classified by organisation_id
print('\n Organization Id 1')
print(o_id1_train.isnull().sum().sort_values())
print(o_id1_train.isnull().values.any())
print(o_id1_train.isnull().sum().sum())
print(len(o_id1_train))
print('\n Organization Id 4')
print(o_id4_train.isnull().sum().sort_values())
print(o_id4_train.isnull().values.any())
print(o_id4_train.isnull().sum().sum())
print(len(o_id4_train))
print('\n Organization Id 8')
print(o_id8_train.isnull().sum().sort_values())
print(o_id8_train.isnull().values.any())
print(o_id8_train.isnull().sum().sum())
print(len(o_id8_train))
print('\n')


 Organization Id 1
query_identifier                    0
clicks                              0
search_abstract                     0
search_keywords                     0
search_title                        0
cbf_parser                          0
algorithm_class                     0
recommendation_algorithm_id_used    0
number_of_recs_in_set               0
local_hour_of_request               0
local_time_of_request               0
timezone_by_ip                      0
country_by_ip                       0
app_lang                            0
ctr                                 0
app_version                         0
response_delivered                  0
hour_request_received               0
request_received                    0
item_type                           0
application_type                    0
num_pubs_by_first_author            0
first_author_id                     0
number_of_authors                   0
year_published                      0
query_document_id             

# Encoding

In [0]:
# Trying to make 3 classifiers, with o_id1_train, o_id4_train, o_id8_train

In [0]:
# Splitting X and y

y_id1=o_id1_train['set_clicked']
y_id4=o_id4_train['set_clicked']
y_id8=o_id8_train['set_clicked']
del o_id1_train['set_clicked']
del o_id4_train['set_clicked']
del o_id8_train['set_clicked']

In [0]:
# Deleting y in test
del o_id1_test['set_clicked']
del o_id4_test['set_clicked']
del o_id8_test['set_clicked']

In [0]:
!pip install category_encoders #Install if this package is absent
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import StandardScaler
#Function to both Target Encode Categorical Data and Normalise Numerical Data (NOTE: THIS FUNCTION REMOVES TIME TO TARGET ENCODE)
def t_encode_func(o_id1_train,y_id1,o_id1_test):
  
  #Categorical Data to Target Encode
  col = list(o_id1_train.select_dtypes(include = ["object"]).columns)
  #Time Data to be removed
  del_col=list(o_id1_train.select_dtypes(include = ["datetime64[ns]"]).columns)
  
  #Removing Time Data to Target Encode without errors
  for i in del_col:
    del o_id1_train[i]
    del o_id1_test[i]
  
  #Data to Normalise
  coll = list(o_id1_train.select_dtypes(exclude=["object"]).columns)

  #Saving Indices to set them again while concatinating, as encoding removes them.
  train_index=o_id1_train.index
  test_index=o_id1_test.index
  
  #Encoding and Normalising
  tenc = TargetEncoder().fit(X=o_id1_train[col],y=y_id1)
  scaler = StandardScaler()
  
  tencoded_train = tenc.transform(o_id1_train[col])
  tencoded_train = pd.DataFrame(tencoded_train).set_index(train_index)
  tencoded_test = tenc.transform(o_id1_test[col])
  tencoded_test = pd.DataFrame(tencoded_test).set_index(test_index)
  
  scaled_train = scaler.fit_transform(o_id1_train[coll])
  scaled_test = scaler.transform(o_id1_test[coll])  

  scaled_train = pd.DataFrame(scaled_train).set_index(train_index)
  scaled_test = pd.DataFrame(scaled_test).set_index(test_index)

  for i in range(np.shape(scaled_train)[1]):
    scaled_train.rename(columns={i:coll[i]},inplace=True)
  for i in range(np.shape(scaled_test)[1]):
    scaled_test.rename(columns={i:coll[i]},inplace=True)
  for i in range(np.shape(tencoded_train)[1]):
    tencoded_train.rename(columns={i:col[i]},inplace=True)

  #Columns to be removed from original data after encoding so that they aren't duplicated while concatenating
  rem=np.append(col,coll)
  train_enc = pd.concat([tencoded_train, scaled_train, o_id1_train.drop(rem, axis = 1)], axis = 1)
  test_enc = pd.concat([tencoded_test, scaled_test, o_id1_test.drop(rem, axis = 1)], axis = 1)
  return train_enc,test_enc,col,coll



In [0]:
train_enc1,test_enc1,col1,coll1=t_encode_func(o_id1_train,y_id1,o_id1_test)
train_enc4,test_enc4,col4,coll4=t_encode_func(o_id4_train,y_id4,o_id4_test)
train_enc8,test_enc8,col8,coll8=t_encode_func(o_id8_train,y_id8,o_id8_test)

In [0]:
train_enc1

Unnamed: 0_level_0,query_identifier,query_detected_language,application_type,item_type,app_version,app_lang,country_by_ip,timezone_by_ip,algorithm_class,cbf_parser,search_title,search_keywords,search_abstract,query_word_count,query_char_count,query_document_id,year_published,number_of_authors,first_author_id,num_pubs_by_first_author,hour_request_received,rec_processing_time,local_hour_of_request,number_of_recs_in_set,recommendation_algorithm_id_used,clicks,ctr
recommendation_set_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
46897,0.020522,0.020818,0.0205,0.020522,0.02126,0.02028,0.016440,0.040000,0.021581,0.020421,0.021581,0.020030,0.020486,-0.569633,-0.841428,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.506569,-0.063429,0.185806,0.136060,-1.108501,-0.116691,-0.114246
46898,0.020522,0.020818,0.0205,0.020522,0.02126,0.02028,0.022007,0.060510,0.021581,0.020421,0.021581,0.020030,0.020486,0.015851,-0.346677,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.506569,-0.024838,0.185806,0.136060,-1.108501,-0.116691,-0.114246
46899,0.020522,0.020818,0.0205,0.020522,0.02126,0.02028,0.011111,0.040698,0.021581,0.020134,0.021581,0.021358,0.020486,1.577140,1.577354,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.506569,-0.003752,0.185806,0.136060,-0.989059,-0.116691,-0.114246
46902,0.020522,0.020818,0.0205,0.020522,0.02126,0.02028,0.019001,0.021352,0.021581,0.020421,0.021581,0.020030,0.020486,0.991656,0.945172,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.506569,-0.030502,0.185806,0.136060,-1.108501,-0.116691,-0.114246
46903,0.020522,0.020818,0.0205,0.020522,0.02126,0.02028,0.019001,0.021352,0.021581,0.020134,0.021581,0.021358,0.020486,0.991656,0.945172,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.506569,-0.030502,0.185806,0.136060,-0.989059,-0.116691,-0.114246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459523,0.020522,0.020818,0.0205,0.020522,0.02126,0.02028,0.017826,0.017709,0.021581,0.020134,0.021581,0.021358,0.020486,0.015851,0.175560,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.328679,-0.038087,0.185806,0.136060,-0.989059,-0.116691,-0.114246
459524,0.020522,0.020818,0.0205,0.020522,0.02126,0.02028,0.021629,0.019479,0.021581,0.020134,0.021581,0.020030,0.020486,0.796495,1.549868,1.136340,0.131344,-0.095050,-0.357533,-0.030963,0.328679,-0.027656,-0.903642,0.136060,0.085923,-0.116691,-0.114246
459526,0.020522,0.020818,0.0205,0.020522,0.02126,0.02028,0.018002,0.027344,0.018706,0.020134,0.017852,0.020030,0.020486,-0.569633,-0.649025,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.328679,-0.060185,0.403696,0.136060,1.280347,-0.116691,-0.114246
459532,0.020522,0.021521,0.0205,0.020522,0.02126,0.02028,0.017826,0.017709,0.021581,0.027594,0.021581,0.021358,0.020486,-1.350277,-1.391151,-0.015933,0.131344,0.597145,0.585963,-0.027343,0.328679,-0.050281,0.185806,-7.333284,-0.152962,-0.116691,-0.114246


# Resampling

In [0]:
from imblearn.combine import SMOTEENN
from collections import Counter
sme = SMOTEENN(sampling_strategy='auto',random_state=42)
X_res1, y_res1 = sme.fit_resample(train_enc1, y_id1)
print('\n Original Dataset Shape_01 %s'%Counter(y_id1))
print('\n Resampled dataset Shape_01 %s' %Counter(y_res1))
X_res4, y_res4 = sme.fit_resample(train_enc4, y_id4)
print('\n Original Dataset Shape_04 %s'%Counter(y_id4))
print('\n Resampled dataset Shape_04 %s' % Counter(y_res4))
Xkaggleer(y_id8))
print('\n Resampled dataset shape_08 %s' % Counter(y_res8))
print('\n Resampling Success \n')




 Original Dataset Shape_01 Counter({0: 264700, 1: 5546})

 Resampled dataset Shape_01 Counter({1: 264700, 0: 264698})

 Original Dataset Shape_04 Counter({0: 99157, 1: 1058})

 Resampled dataset Shape_04 Counter({0: 99157, 1: 99157})

 Original Dataset Shape_08 Counter({0: 15067, 1: 159})

 Resampled dataset shape_08 Counter({0: 15067, 1: 15067})

 Resampling Success 



In [0]:
pd.DataFrame(X_res1).to_csv('folder/OID_01_Xtrain.csv')
pd.DataFrame(X_res4).to_csv('folder/OID_04_Xtrain.csv')
pd.DataFrame(X_res8).to_csv('folder/OID_08_Xtrain.csv')
pd.DataFrame(y_res1).to_csv('folder/OID_01_ytrain.csv')
pd.DataFrame(y_res4).to_csv('folder/OID_04_ytrain.csv')
pd.DataFrame(y_res8).to_csv('folder/OID_08_ytrain.csv')
pd.DataFrame(test_enc1).to_csv('folder/OID_01_Xtest.csv')
pd.DataFrame(test_enc4).to_csv('folder/OID_04_Xtest.csv')
pd.DataFrame(test_enc8).to_csv('folder/OID_08_Xtest.csv')

In [0]:
# from imblearn.over_sampling import ADASYN
# ada = ADASYN(sampling_strategy='auto', random_state=42)
# X_res,y_res=ada.fit_resample(o_id1_train,y_id1)

# CatBoostClassifier

In [0]:
# saving to Google Drive
pd.DataFrame(X_res1).to_csv(path+'folder03/OID_01_Xtrain.csv')
pd.DataFrame(X_res4).to_csv(path+'folder03/OID_04_Xtrain.csv')
pd.DataFrame(X_res8).to_csv(path+'folder03/OID_08_Xtrain.csv')
pd.DataFrame(y_res1).to_csv(path+'folder03/OID_01_ytrain.csv')
pd.DataFrame(y_res4).to_csv(path+'folder03/OID_04_ytrain.csv')
pd.DataFrame(y_res8).to_csv(path+'folder03/OID_08_ytrain.csv')
pd.DataFrame(test_enc1).to_csv(path+'folder03/OID_01_Xtest.csv')
pd.DataFrame(test_enc4).to_csv(path+'folder03/OID_04_Xtest.csv')
pd.DataFrame(test_enc8).to_csv(path+'folder03/OID_08_Xtest.csv')

In [0]:
X_res1=pd.read_csv()
X_res4
X_res8
y_res1
y_res4
y_res8

TypeError: ignored

In [0]:
col1

['query_identifier',
 'query_detected_language',
 'application_type',
 'item_type',
 'app_version',
 'app_lang',
 'country_by_ip',
 'timezone_by_ip',
 'algorithm_class',
 'cbf_parser',
 'search_title',
 'search_keywords',
 'search_abstract']

In [0]:
coll1

['query_word_count',
 'query_char_count',
 'query_document_id',
 'year_published',
 'number_of_authors',
 'first_author_id',
 'num_pubs_by_first_author',
 'hour_request_received',
 'rec_processing_time',
 'local_hour_of_request',
 'number_of_recs_in_set',
 'recommendation_algorithm_id_used',
 'clicks',
 'ctr']

In [0]:
X_res1=pd.read_csv(path+'folder03/OID_01_Xtrain.csv',index_col=0)
X_res4=pd.read_csv(path+'folder03/OID_04_Xtrain.csv',index_col=0)
X_res8=pd.read_csv(path+'folder03/OID_08_Xtrain.csv',index_col=0)
y_res1=pd.read_csv(path+'folder03/OID_01_ytrain.csv')
y_res4=pd.read_csv(path+'folder03/OID_04_ytrain.csv')
y_res8=pd.read_csv(path+'folder03/OID_08_ytrain.csv')
test_enc1=pd.read_csv(path+'folder03/OID_01_Xtest.csv',index_col=0)
test_enc4=pd.read_csv(path+'folder03/OID_04_Xtest.csv',index_col=0)
test_enc8=pd.read_csv(path+'folder03/OID_08_Xtest.csv',index_col=0)

In [0]:
del X_res1['Unnamed: 0.1']

In [0]:
X_res1

Unnamed: 0,query_identifier,query_detected_language,application_type,item_type,app_version,app_lang,country_by_ip,timezone_by_ip,algorithm_class,cbf_parser,search_title,search_keywords,search_abstract,query_word_count,query_char_count,query_document_id,year_published,number_of_authors,first_author_id,num_pubs_by_first_author,hour_request_received,rec_processing_time,local_hour_of_request,number_of_recs_in_set,recommendation_algorithm_id_used,clicks,ctr
0,0.020522,0.020818,0.0205,0.020522,0.021260,0.02028,0.016440,0.040000,0.021581,0.020421,0.021581,0.020030,0.020486,-0.569633,-0.841428,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.506569,-0.063429,0.185806,0.13606,-1.108501,-0.116691,-0.114246
1,0.020522,0.020818,0.0205,0.020522,0.021260,0.02028,0.022007,0.060510,0.021581,0.020421,0.021581,0.020030,0.020486,0.015851,-0.346677,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.506569,-0.024838,0.185806,0.13606,-1.108501,-0.116691,-0.114246
2,0.020522,0.020818,0.0205,0.020522,0.021260,0.02028,0.011111,0.040698,0.021581,0.020134,0.021581,0.021358,0.020486,1.577140,1.577354,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.506569,-0.003752,0.185806,0.13606,-0.989059,-0.116691,-0.114246
3,0.020522,0.020818,0.0205,0.020522,0.021260,0.02028,0.019001,0.021352,0.021581,0.020421,0.021581,0.020030,0.020486,0.991656,0.945172,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.506569,-0.030502,0.185806,0.13606,-1.108501,-0.116691,-0.114246
4,0.020522,0.020818,0.0205,0.020522,0.021260,0.02028,0.019001,0.021352,0.021581,0.020134,0.021581,0.021358,0.020486,0.991656,0.945172,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.506569,-0.030502,0.185806,0.13606,-0.989059,-0.116691,-0.114246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529393,0.020522,0.020818,0.0205,0.020522,0.021260,0.02028,0.018329,0.017966,0.021581,0.020421,0.021581,0.020030,0.020486,-0.931288,-0.919849,2.653580,-0.097484,-0.084882,-0.311314,0.038759,0.189982,-0.053900,0.015924,0.13606,-1.108501,3.680901,3.487742
529394,0.020522,0.018113,0.0205,0.020522,0.018995,0.02028,0.017826,0.017709,0.024917,0.020134,0.018767,0.020355,0.020486,-0.374472,-0.588895,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.016516,-0.033522,0.185806,0.13606,-0.989059,7.478493,7.087208
529395,0.020522,0.020818,0.0205,0.020522,0.016495,0.02028,0.021299,0.026334,0.026001,0.020134,0.017852,0.020030,0.020486,0.991656,0.605928,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,-0.112043,-0.006774,-0.467863,0.13606,-0.989059,3.680901,3.487742
529396,0.020522,0.020818,0.0205,0.020522,0.021260,0.02028,0.017826,0.017709,0.021581,0.020421,0.021581,0.021358,0.020581,0.664971,0.451020,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.328679,0.169657,0.185806,0.13606,-0.311351,3.680901,3.487742


In [0]:
test_enc1

Unnamed: 0_level_0,query_identifier,query_detected_language,application_type,item_type,app_version,app_lang,country_by_ip,timezone_by_ip,algorithm_class,cbf_parser,search_title,search_keywords,search_abstract,query_word_count,query_char_count,query_document_id,year_published,number_of_authors,first_author_id,num_pubs_by_first_author,hour_request_received,rec_processing_time,local_hour_of_request,number_of_recs_in_set,recommendation_algorithm_id_used,clicks,ctr
recommendation_set_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
46914.0,0.020522,0.020818,0.0205,0.020522,0.013165,0.020280,0.019001,0.021352,0.021581,0.020421,0.021581,0.020030,0.020486,1.772301,1.659812,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.506569,0.073678,0.185806,0.13606,-1.108501,-0.116691,-0.114246
46961.0,0.020522,0.020818,0.0205,0.020522,0.021260,0.020280,0.017826,0.017709,0.021581,0.020421,0.021581,0.020030,0.020486,-0.764794,-0.154274,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.684460,0.073678,0.185806,0.13606,-1.108501,-0.116691,-0.114246
47012.0,0.020522,0.020818,0.0205,0.020522,0.021260,0.020280,0.015562,0.032258,0.021581,0.020134,0.021581,0.021358,0.020486,-0.179310,-0.319191,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.684460,0.073678,0.185806,0.13606,-0.989059,-0.116691,-0.114246
47181.0,0.020522,0.020818,0.0205,0.020522,0.021260,0.020280,0.017826,0.020079,0.021581,0.020421,0.021581,0.020030,0.020486,0.015851,-0.181760,0.038386,0.182874,0.320267,0.969005,-0.030684,1.573914,0.073678,0.185806,0.13606,-1.108501,-0.116691,-0.114246
47240.0,0.020522,0.020818,0.0205,0.020522,0.021260,0.020280,0.021629,0.018051,0.021581,0.020421,0.021581,0.020030,0.020486,0.211012,0.422935,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,-2.339683,0.073678,0.185806,0.13606,-1.108501,-0.116691,-0.114246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459307.0,0.020522,0.017234,0.0205,0.020522,0.065495,0.017084,0.017826,0.017709,0.021581,0.020134,0.021581,0.020030,0.020626,0.796495,1.577354,-0.380979,0.028284,-0.095050,-0.220687,-0.030592,-0.027103,0.073678,-0.249973,0.13606,0.444250,-0.116691,-0.114246
459313.0,0.020522,0.020818,0.0205,0.020522,0.021260,0.020280,0.015562,0.017709,0.021581,0.020134,0.021581,0.021358,0.020626,-0.179310,-0.456622,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,-0.027103,0.073678,-0.249973,0.13606,0.205365,-0.116691,-0.114246
459386.0,0.020522,0.020818,0.0205,0.020522,0.021260,0.020280,0.017826,0.017709,0.021581,0.020134,0.021581,0.020030,0.020486,-0.569633,-0.456622,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.150788,0.073678,-0.032084,0.13606,0.085923,-0.116691,-0.114246
459432.0,0.020522,0.020818,0.0205,0.020522,0.021260,0.020280,0.011111,0.010797,0.018706,0.020134,0.017852,0.020030,0.020486,-1.740599,-1.583554,-0.420324,0.131344,-0.095050,-0.357533,-0.030963,0.150788,0.073678,1.493144,0.13606,1.997002,-0.116691,-0.114246


In [0]:
print(list(X_res1.columns)==list(test_enc1.columns))
print(list(X_res4.columns)==list(test_enc4.columns))
print(list(X_res8.columns)==list(test_enc8.columns))

False
False
False


In [0]:
list(X_res1.columns)

['Unnamed: 0',
 'Unnamed: 0.1',
 'query_identifier',
 'query_detected_language',
 'application_type',
 'item_type',
 'app_version',
 'app_lang',
 'country_by_ip',
 'timezone_by_ip',
 'algorithm_class',
 'cbf_parser',
 'search_title',
 'search_keywords',
 'search_abstract',
 'query_word_count',
 'query_char_count',
 'query_document_id',
 'year_published',
 'number_of_authors',
 'first_author_id',
 'num_pubs_by_first_author',
 'hour_request_received',
 'rec_processing_time',
 'local_hour_of_request',
 'number_of_recs_in_set',
 'recommendation_algorithm_id_used',
 'clicks',
 'ctr']

In [0]:
list(test_enc1.columns)

['recommendation_set_id',
 'query_identifier',
 'query_detected_language',
 'application_type',
 'item_type',
 'app_version',
 'app_lang',
 'country_by_ip',
 'timezone_by_ip',
 'algorithm_class',
 'cbf_parser',
 'search_title',
 'search_keywords',
 'search_abstract',
 'query_word_count',
 'query_char_count',
 'query_document_id',
 'year_published',
 'number_of_authors',
 'first_author_id',
 'num_pubs_by_first_author',
 'hour_request_received',
 'rec_processing_time',
 'local_hour_of_request',
 'number_of_recs_in_set',
 'recommendation_algorithm_id_used',
 'clicks',
 'ctr']

In [0]:
def clean_colname(col1,coll1,testo):
  for i in range(len(col1)):
    testo.rename(columns={str(i):col1[i]},inplace=True)
  for j in range(i,len(coll1)+i):
    testo.rename(columns={str(j+1):coll1[j-i]},inplace=True)
clean_colname(col1,coll1,X_res1)
clean_colname(col4,coll4,X_res4)
clean_colname(col8,coll8,X_res8)

In [0]:
y_res1.rename(columns={'0':'set_clicked'},inplace=True)
y_res4.rename(columns={'0':'set_clicked'},inplace=True)
y_res8.rename(columns={'0':'set_clicked'},inplace=True)