# Import all necessary libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import log_loss

from scipy.stats import uniform

from matplotlib import pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read the input data and print the first rows of each DataFrame

In [None]:
# Load the training table and show its first rows as a quick sanity check.
df_train = pd.read_csv("/kaggle/input/a-tour-of-machine-learning-2020/data_train.csv")
print(df_train.head())
# Same for the test table; it is read from the same competition directory.
df_test = pd.read_csv("/kaggle/input/a-tour-of-machine-learning-2020/data_test.csv")
print(df_test.head())

# Prepare input files
Prepare the input matrices by removing the GeneId column and the target vector (Label) from the data. GeneId might be wrongly interpreted as a number, so make sure the IDs are strings by converting them with the map function.

In [None]:
# pop() removes the column from the frame in place, so after both pops
# df_train holds feature columns only and can serve as the design matrix.
y_train = df_train.pop("Label")
train_GI = [*map(str, df_train.pop("GeneId"))]
X_train = df_train

# The test table carries no label column to remove; only the IDs are split off.
test_GI = [*map(str, df_test.pop("GeneId"))]
X_test = df_test

# Validation set
Hold out 20% of the training data as a validation subset.

In [None]:
# 80/20 split of the training data; the fixed seed keeps the split reproducible.
X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Train the model
Fit a model with random hyperparameter optimization.

In [None]:
n_jobs = 4
n_iter_params = 100
scoring = "neg_log_loss"

# Hyperparameters sampled for every candidate, whatever the penalty.
shared_space = {
    # scipy's uniform(loc, scale) draws from [loc, loc + scale].
    "C": uniform(0.001, 100.0),
    "fit_intercept": [True, False],
}

# Each penalty is paired with a solver that supports it. With the default
# solver (lbfgs) the "l1" and "elasticnet" candidates fail to fit and are
# scored as NaN, which silently wastes their share of the search budget;
# "elasticnet" additionally requires an l1_ratio.
# RandomizedSearchCV first picks one of these dicts, then samples inside it.
# NOTE(review): saga may need more iterations on unscaled features, and newer
# scikit-learn releases spell the unpenalised option None instead of "none".
params = [
    {**shared_space, "penalty": ["l2", "none"]},
    {**shared_space, "penalty": ["l1"], "solver": ["liblinear"]},
    {
        **shared_space,
        "penalty": ["elasticnet"],
        "solver": ["saga"],
        "l1_ratio": uniform(0.0, 1.0),
    },
]

model = LogisticRegression()

rscv_obj = RandomizedSearchCV(
    model,
    params,
    verbose=0,
    n_jobs=n_jobs,
    n_iter=n_iter_params,
    scoring=scoring,
    refit=True,
    random_state=42,  # make the sampled candidates reproducible, like the split
)

rscv_obj.fit(X_train_sub, y_train_sub)

# Get predictions

In [None]:
# Class-probability predictions of the refitted search object for both splits.
pred_train_sub, pred_valid = (
    rscv_obj.predict_proba(features) for features in (X_train_sub, X_valid)
)

# Get evaluation of predictions

In [None]:
# Log loss on the split the search was fitted on versus the held-out split.
# The second figure is computed on the validation subset (y_valid), not on the
# competition test set, so it is labelled accordingly.
print("Logloss on training data: ",log_loss(y_train_sub,pred_train_sub))
print("Logloss on validation data: ",log_loss(y_valid,pred_valid))

# Add a new feature

In [None]:
# Add the row total over all current columns as an extra feature.
# assign() returns a new frame instead of writing into the frames handed back
# by train_test_split, which avoids pandas' SettingWithCopyWarning.
# NOTE: re-running this cell sums the existing "sum_all" column into the new one.
X_train_sub = X_train_sub.assign(sum_all=X_train_sub.sum(axis=1))
X_valid = X_valid.assign(sum_all=X_valid.sum(axis=1))

print(X_train_sub.head())

In [None]:
# Shared bin edges 0, 10, ..., 4990 for every histogram.
hist_bins = np.arange(0, 5000, 10)

# Draw the two class histograms twice with the drawing order swapped, so that
# neither class permanently hides the other.
for draw_order in ((1, 0), (0, 1)):
    for cls in draw_order:
        plt.hist(X_train_sub["sum_all"][y_train_sub == cls], hist_bins, label=f"Class {cls}")
    plt.legend()
    plt.show()

# Train the model
Fit a model with random hyperparameter optimization and the new sum feature.

In [None]:
n_jobs = 4
n_iter_params = 100
scoring = "neg_log_loss"

# Hyperparameters sampled for every candidate, whatever the penalty.
shared_space = {
    # scipy's uniform(loc, scale) draws from [loc, loc + scale].
    "C": uniform(0.001, 100.0),
    "fit_intercept": [True, False],
}

# Each penalty is paired with a solver that supports it. With the default
# solver (lbfgs) the "l1" and "elasticnet" candidates fail to fit and are
# scored as NaN, which silently wastes their share of the search budget;
# "elasticnet" additionally requires an l1_ratio.
# RandomizedSearchCV first picks one of these dicts, then samples inside it.
# NOTE(review): saga may need more iterations on unscaled features, and newer
# scikit-learn releases spell the unpenalised option None instead of "none".
params = [
    {**shared_space, "penalty": ["l2", "none"]},
    {**shared_space, "penalty": ["l1"], "solver": ["liblinear"]},
    {
        **shared_space,
        "penalty": ["elasticnet"],
        "solver": ["saga"],
        "l1_ratio": uniform(0.0, 1.0),
    },
]

model = LogisticRegression()

rscv_obj = RandomizedSearchCV(
    model,
    params,
    verbose=1,
    n_jobs=n_jobs,
    n_iter=n_iter_params,
    scoring=scoring,
    refit=True,
    random_state=42,  # make the sampled candidates reproducible, like the split
)

rscv_obj.fit(X_train_sub, y_train_sub)

# Get predictions

In [None]:
# Class-probability predictions of the refitted search object for both splits.
pred_train_sub, pred_valid = (
    rscv_obj.predict_proba(features) for features in (X_train_sub, X_valid)
)

# Get evaluation of predictions

In [None]:
# Log loss on the split the search was fitted on versus the held-out split.
# The second figure is computed on the validation subset (y_valid), not on the
# competition test set, so it is labelled accordingly.
print("Logloss on training data: ",log_loss(y_train_sub,pred_train_sub))
print("Logloss on validation data: ",log_loss(y_valid,pred_valid))

In [None]:
def get_sum_single(df, n_groups=5):
    """Sum every ``n_groups``-th column into one column per group.

    Columns are treated as interleaved: group ``i`` consists of the columns at
    positions ``i, i + n_groups, i + 2 * n_groups, ...``. Each group is named
    after the part of its first column's label that precedes the first ``"_"``.

    Every column of ``df`` takes part, so an extra column appended to the
    frame ends up in whichever group its position falls into.

    Args:
        df: DataFrame whose first ``n_groups`` column labels are strings.
        n_groups: Number of interleaved groups. The default of 5 reproduces
            the previously hard-coded stride.

    Returns:
        DataFrame indexed like ``df`` with one column per group. Groups whose
        names coincide overwrite each other (the last one wins).

    Raises:
        IndexError: If ``df`` has fewer than ``n_groups`` columns.
    """
    group_sums = {}
    for start in range(n_groups):
        group_name = df.columns[start].split("_")[0]
        # A strided slice selects positions start, start + n_groups, ...
        group_sums[group_name] = df.iloc[:, start::n_groups].sum(axis=1)
    return pd.DataFrame(group_sums)

def get_sum_region(df):
    """Sum windows of five adjacent columns into one column per window.

    For each ``offset`` in ``0, 5, ..., 95`` the columns at positions
    ``5 * offset`` to ``5 * offset + 4`` are summed row-wise. The resulting
    column is named after the second ``"_"``-separated field of the label at
    position ``offset``.

    NOTE(review): the label is read at position ``offset`` while the summed
    window starts at ``5 * offset``, so for every ``offset > 0`` the name comes
    from a column outside the window. Confirm which of the two is intended.

    Returns:
        DataFrame indexed like ``df`` with one column per window. Windows
        whose names coincide overwrite each other (the last one wins).
    """
    labels = df.columns
    window_sums = {
        labels[offset].split("_")[1]: df.iloc[:, range(5 * offset, 5 * offset + 5)].sum(axis=1)
        for offset in range(0, 98, 5)
    }
    return pd.DataFrame(window_sums)

In [None]:
# Reduced feature matrices: per-prefix sums, per-window sums and the row total.
# NOTE(review): both frames already contain "sum_all" at this point, and
# get_sum_single strides over every column, so that total may be folded into
# one of the per-prefix sums — confirm this is intended.
X_train_sub_single = get_sum_single(X_train_sub)
X_train_sub_region = get_sum_region(X_train_sub)
X_train_sub_min = pd.concat(
    [X_train_sub_single, X_train_sub_region, X_train_sub["sum_all"]],
    axis="columns",
)

X_valid_single = get_sum_single(X_valid)
X_valid_region = get_sum_region(X_valid)
X_valid_min = pd.concat(
    [X_valid_single, X_valid_region, X_valid["sum_all"]],
    axis="columns",
)

In [None]:
n_jobs = 4
n_iter_params = 100
scoring = "neg_log_loss"

# Hyperparameters sampled for every candidate, whatever the penalty.
shared_space = {
    # scipy's uniform(loc, scale) draws from [loc, loc + scale].
    "C": uniform(0.001, 100.0),
    "fit_intercept": [True, False],
}

# Each penalty is paired with a solver that supports it. With the default
# solver (lbfgs) the "l1" and "elasticnet" candidates fail to fit and are
# scored as NaN, which silently wastes their share of the search budget;
# "elasticnet" additionally requires an l1_ratio.
# RandomizedSearchCV first picks one of these dicts, then samples inside it.
# NOTE(review): saga may need more iterations on unscaled features, and newer
# scikit-learn releases spell the unpenalised option None instead of "none".
params = [
    {**shared_space, "penalty": ["l2", "none"]},
    {**shared_space, "penalty": ["l1"], "solver": ["liblinear"]},
    {
        **shared_space,
        "penalty": ["elasticnet"],
        "solver": ["saga"],
        "l1_ratio": uniform(0.0, 1.0),
    },
]

model = LogisticRegression()

rscv_obj = RandomizedSearchCV(
    model,
    params,
    verbose=1,
    n_jobs=n_jobs,
    n_iter=n_iter_params,
    scoring=scoring,
    refit=True,
    random_state=42,  # make the sampled candidates reproducible, like the split
)

rscv_obj.fit(X_train_sub_min, y_train_sub)

In [None]:
# Class-probability predictions on the reduced feature matrices of both splits.
pred_train_sub, pred_valid = (
    rscv_obj.predict_proba(features) for features in (X_train_sub_min, X_valid_min)
)

In [None]:
# Log loss on the split the search was fitted on versus the held-out split.
# The second figure is computed on the validation subset (y_valid), not on the
# competition test set, so it is labelled accordingly.
print("Logloss on training data: ",log_loss(y_train_sub,pred_train_sub))
print("Logloss on validation data: ",log_loss(y_valid,pred_valid))

# Generate the submission predictions

In [None]:
X_train["sum_all"] = X_train.sum(axis=1)
X_train_single = get_sum_single(X_train) #od.concat([,X_train_sub],axis=1)
X_train_region = get_sum_region(X_train) 
X_train_min = pd.concat([X_train_single,X_train_region,X_train["sum_all"]],axis=1)

model = LogisticRegression(**rscv_obj.best_params_)
model.fit(X_train,y_train)

In [None]:
X_test["sum_all"] = X_test.sum(axis=1)
X_test_single = get_sum_single(X_test) #od.concat([,X_train_sub],axis=1)
X_test_region = get_sum_region(X_test) 
X_test_min = pd.concat([X_test_single,X_test_region,X_test["sum_all"]],axis=1)

pred_test = model.predict_proba(X_test)

In [None]:
# Two-column submission table: one row per test gene, holding its ID and the
# value from column 1 of pred_test (the second class reported by the model).
# NOTE(review): building the frame row-wise and transposing presumably leaves
# both columns as object dtype — confirm this is acceptable for the CSV.
df_submit = pd.DataFrame([test_GI,pred_test[:,1]],index=["GeneId","Label"]).T

In [None]:
# Write the submission file to the current working directory, omitting the row index.
df_submit.to_csv("submission.csv",index=False)