# 1. Preprocessing and handling the data

Before fitting a logistic regression model, we preprocess the data and separate the features from their labels.

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn import decomposition

In [2]:
# Fixed seed so the hold-out split (and every score computed from it) is the
# same on each Restart & Run All.
SEED = 42

# Labeled training data and the unlabeled test set.
data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Separate the 'Lead' label column from the feature columns.
y = data['Lead']
X = data.drop(columns='Lead')  # Can any other columns be dropped?

# FIXME: (potentially) this can be unnecessary since we already have two data sets of unlabeled examples
# Hold out 20% of the labeled rows for a local accuracy estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Model
model = LogisticRegression()


# 2. Pipeline for gridsearch

In [3]:
# Pipeline stages: standardize -> PCA -> classifier (`model` comes from the previous cell).
skl_sc = StandardScaler()
pca = decomposition.PCA()
pipeline = Pipeline(
    steps=[
        ('skl_sc', skl_sc),
        ('pca', pca),
        ('model', model),
    ]
)

# Candidate PCA dimensionalities: 1 up to the number of feature columns.
n_components = list(range(1, X_train.shape[1] + 1))

# Hyperparameters C and penalty: 50 log-spaced values of C between 1e-4 and 1e4.
C = np.logspace(-4, 4, 50)
penalty = ['l2']

# Keys use the '<step name>__<parameter>' form so they address the pipeline steps.
params = {
    'pca__n_components': n_components,
    'model__C': C,
    'model__penalty': penalty,
}

# 3. Gridsearch and CV

In [4]:
# Exhaustive search over `params` with GridSearchCV's default cross-validation
# (no `cv` argument is passed); refit=True retrains the best configuration on
# the data given to fit(). Uses 6 parallel jobs.
gridSearch = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    refit=True,
    n_jobs=6,
)
ans = gridSearch.fit(X_train, y_train)

# Predictions of the refitted best pipeline on the held-out split.
y_pred_grid = gridSearch.predict(X_test)

In [5]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn import decomposition

# Rebuild the pipeline from the hyperparameters selected by the grid search
# and fit it on the training split.
best_params = gridSearch.best_params_

skl_sc = StandardScaler()
pca = decomposition.PCA(n_components=best_params['pca__n_components'])
model = LogisticRegression(C=best_params['model__C'], penalty=best_params['model__penalty'])
pipeline = Pipeline(steps=[('skl_sc', skl_sc),
                            ('pca', pca),
                            ('model', model)])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Use a separate array for the true labels instead of rebinding `y_test` to a
# list: rebinding made this cell fail on a second run (a list has no .to_list()).
y_true = y_test.to_numpy()
assert len(y_true) == len(y_pred)

# Hold-out accuracy: fraction of predictions equal to the true label.
correct = int((y_pred == y_true).sum())
print(correct / len(y_pred))

0.9086538461538461


# 4. Print the best result

In [6]:
# Report the settings of the estimator selected by the grid search.
best_settings = gridSearch.best_estimator_.get_params()

print('Best Penalty:', best_settings['model__penalty'])
print('Best C:', best_settings['model__C'])
print('Best Number Of Components:', best_settings['pca__n_components'])
print()
print(best_settings['model'])

Best Penalty: l2
Best C: 35.564803062231285
Best Number Of Components: 12

LogisticRegression(C=35.564803062231285)


# 5. Create the CSV file

In [7]:
from utils import gen_csv_from_pred
# Predict labels for the unlabeled test file with the fitted pipeline, then pass
# them to the project helper (presumably writes the CSV file - confirm in utils).
submission_features = pd.read_csv('data/test.csv')
y_pred_file = pipeline.predict(submission_features)
gen_csv_from_pred(y_pred_file, "Logistic_regression")