In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# Let wide frames render up to 500 columns before pandas truncates the display.
pd.options.display.max_columns = 500

In [21]:
# Read each preprocessed split (first CSV column is the index), then
# zero out +/-inf and missing values in a single chain per frame.
train_data = (
    pd.read_csv('./preprocessed_train_data.csv', index_col=0)
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)
test_data = (
    pd.read_csv('./preprocessed_test_data.csv', index_col=0)
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)

In [22]:
def NormalizeData(train, c_test):
    """Standardise the non-categorical columns of a train/test pair.

    Every column whose name does not start with ``'Cat_'`` is passed through
    a ``StandardScaler`` that is fitted on ``train`` only and then applied to
    ``c_test``. Columns prefixed with ``'Cat_'`` are left as they are.

    The inputs are not modified: scaling is done on copies, so calling this
    function again on the same frames gives the same result.

    Parameters
    ----------
    train : pandas.DataFrame
        Training features; the scaler statistics are learned from this frame.
    c_test : pandas.DataFrame
        Frame to transform with the statistics learned from ``train``. It
        must contain every column of ``train`` that gets scaled.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_values, test_values)`` - the ``.values`` of the scaled
        copies, in the original column order of each frame.
    """
    # Columns to scale: everything that is not flagged as categorical.
    scaled_columns = train.columns[~train.columns.str.startswith('Cat_')].tolist()

    # Work on copies so the caller's frames keep their raw values.
    train_scaled = train.copy()
    test_scaled = c_test.copy()

    # Nothing to fit when every column is categorical.
    if scaled_columns:
        sc = StandardScaler()
        # Whole-column assignment replaces the columns, so integer inputs
        # become float without writing floats into integer storage via .loc.
        train_scaled[scaled_columns] = sc.fit_transform(train[scaled_columns])
        test_scaled[scaled_columns] = sc.transform(c_test[scaled_columns])

    return train_scaled.values, test_scaled.values

In [23]:
# Pull the target out of the training frame, then remove the target and the
# leftover 'index' column so that only feature columns remain.
y = train_data['label']
train_data = train_data.drop(columns=['label'])
train_data = train_data.drop(columns=['index'])

# Report the frame shapes before and after standardisation.
print(
    'Train shape :', train_data.shape,
    'Test shape', test_data.shape,
)
# NOTE: from here on `test_data` is the scaled array returned by
# NormalizeData, no longer the DataFrame that was loaded from disk.
X, test_data = NormalizeData(train_data, test_data)
print(
    'Scaled Train shape :', X.shape,
    'Scaled Test shape', test_data.shape,
)

# 5-fold cross-validation of a multinomial logistic regression on X / y.
#
# KFold yields *positional* row indices. `y` is a pandas Series carrying the
# index read from the CSV, and `y[positions]` would look those integers up as
# labels, which only matches when that index is exactly 0..n-1. The labels
# are therefore converted to a NumPy array and indexed by position.
#
# NOTE(review): X appears to be standardised on the full training set before
# this split, so validation rows contribute to the scaler statistics.
# Confirm whether that is acceptable for the reported scores.
y_values = np.asarray(y)
train_scores, valid_scores = [], []

kf = KFold(n_splits=5)
for train_index, test_index in kf.split(X, y_values):

    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y_values[train_index], y_values[test_index]

    # Inner 3-fold search over the regularisation strength, selected by
    # macro-averaged F1.
    model = LogisticRegressionCV(cv=3, scoring='f1_macro', multi_class='multinomial', max_iter=400, n_jobs=-1)
    print("Fitting the model...")
    model.fit(X_train, y_train)
    print("Done fitting")

    # Predict on both the fitting rows and the held-out rows of this fold.
    train_predictions = model.predict(X_train)
    valid_predictions = model.predict(X_valid)

    # NOTE(review): the model is tuned on macro-F1 but scored with micro-F1
    # here (for single-label targets that equals accuracy). Confirm this
    # mismatch is intended.
    train_scores.append(f1_score(y_train, train_predictions, average="micro"))
    valid_scores.append(f1_score(y_valid, valid_predictions, average="micro"))
    print("F1 score on training set", train_scores[-1])
    print("F1 score on validation set", valid_scores[-1])

# Summarise across folds so the run ends with a single comparable number.
print("Mean F1 score on training set", np.mean(train_scores))
print("Mean F1 score on validation set", np.mean(valid_scores))

Train shape : (80176, 249) Test shape (34021, 249)
Scaled Train shape : (80176, 249) Scaled Test shape (34021, 249)
Fitting the model...


KeyboardInterrupt: 