In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold

In [72]:
# Load the training set (contains the target column "y") and the evaluation set
# that is to be classified. Paths are relative to the working directory.
# The StandardScaler is created in the preprocessing cell where it is fitted,
# so it is not instantiated here.
train_data_csv = "TrainOnMe.csv"
eval_data_csv = "EvaluateOnMe.csv"

df = pd.read_csv(train_data_csv)
eval_df = pd.read_csv(eval_data_csv)

In [73]:
# Preprocessing: repair misspelled labels/values, encode the boolean-like columns,
# standardise the numeric columns and select the model features.
# NOTE: this cell mutates `df` and `eval_df` in place (dropping "y", scaling);
# re-run the loading cell before re-running this one.

# Numeric columns that get standardised.
NUMERIC_COLS = ["x1","x2","x3","x4","x7","x8","x9","x10","x13"]
# Columns handed to the model: the numeric columns plus the encoded x11.
FEATURE_COLS = ['x1','x2','x3','x4','x7','x8','x9','x10','x11','x13']

# Correct misspelled labels and category values, and name the missing x6 entries.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# df["col"] operates on an intermediate Series and is not guaranteed to reach df
# (it has no effect under pandas Copy-on-Write).
df["y"] = df["y"].replace(["ragspel","yckelharpa","erpent"],["Dragspel","Nyckelharpa","Serpent"])
df["x11"] = df["x11"].replace(["Tru","F"],["True","False"])
df["x12"] = df["x12"].replace(["Flase","F"],"False")
df["x6"] = df["x6"].replace(["Ostra stationen",np.nan],["Östra stationen","Unknown"])

# 0.0 and -0.0 compare equal as floats, so the text form is used to tell them
# apart and rewrite them as 0 and 1.
df['x5'] = df['x5'].astype(str).replace(['0.0','-0.0'],[0,1]).astype(float)
# Keep only the rows whose x5 is one of those two values. The index is reset so the
# feature frame and the labels extracted below share the same 0..n-1 index, and so
# the later column assignments act on an independent frame rather than a slice.
df = df.loc[(df['x5'] == 0) | (df['x5'] == 1)].reset_index(drop=True)

# Translating True/False (and the 0/1 of x5) to ints.
lbe = LabelEncoder()
# The x11 encoding is learned on the training data and then applied unchanged to the
# evaluation data, so a value maps to the same integer in both frames. Both sides are
# cast to str because the two files may be parsed with different dtypes (text vs bool).
# transform() raises ValueError if the evaluation data holds a value never seen in training.
df['x11'] = lbe.fit_transform(df['x11'].astype(str))
eval_df['x11'] = lbe.transform(eval_df['x11'].astype(str))
df['x12'] = lbe.fit_transform(df['x12'])
df['x5'] = lbe.fit_transform(df['x5'])
#df['x6'] = lbe.fit_transform(df['x6'])  # Not used, Could be done with LabelEncoder or OneHotEncoder

labels = df["y"]            # Collecting labels (index already 0..n-1 after the reset above)
df = df.drop(columns='y')   # Removing the labels from the training data

# Scaling on numbers-only columns: statistics come from the training data only
# and are re-applied to the evaluation data.
scaler = StandardScaler()
df[NUMERIC_COLS] = scaler.fit_transform(df[NUMERIC_COLS])
eval_df[NUMERIC_COLS] = scaler.transform(eval_df[NUMERIC_COLS])
training_data = df[FEATURE_COLS]
eval_data = eval_df[FEATURE_COLS]
eval_data

Unnamed: 0,x1,x2,x3,x4,x7,x8,x9,x10,x11,x13
0,-1.274876,1.914052,-0.515880,1.276361,-2.105815,1.346296,0.171269,0.291259,1,1.274871
1,-0.090712,1.099290,-0.055784,0.092737,-0.540173,0.846386,-0.263443,-0.832786,1,0.090718
2,-1.450261,0.838829,1.090436,1.449807,-2.180773,2.031808,-0.784332,0.914315,0,1.450266
3,-2.830652,-0.092696,-0.468880,2.829023,-3.153238,2.459801,-0.666523,-0.351724,0,2.830652
4,1.303778,-1.327479,-0.356170,-1.304173,2.290736,-1.249464,0.428330,-0.086778,1,-1.303783
...,...,...,...,...,...,...,...,...,...,...
9995,0.963308,0.856488,-0.556292,-0.964512,0.452466,-1.569565,1.041661,0.143232,1,-0.963313
9996,-0.185536,0.334457,-1.441810,0.183962,-0.455423,-0.031864,0.107772,-1.306767,1,0.185536
9997,1.801704,-0.479538,-0.384701,-1.800832,1.195918,-1.187880,-1.174123,1.017397,1,-1.801704
9998,1.070886,0.790135,-0.647139,-1.070995,1.432146,-0.443671,0.282312,1.319688,1,-1.070886


In [None]:
from stacking import stacking_eval, stacking_get_labels
#from sklearn.preprocessing import LabelEncoder
def evaluate_model(selected_data,labels,train_size=0.8,random_state=None):
    """Score the stacking model on one stratified hold-out split and print the mean score.

    Parameters
    ----------
    selected_data : feature table, one row per sample.
    labels : class labels, positionally aligned with ``selected_data``.
    train_size : fraction of the samples used for training (default 0.8).
    random_state : seed forwarded to ``train_test_split``. ``None`` (the default)
        gives a different split on every call; pass an int for a reproducible score.

    Returns
    -------
    None. The mean of whatever ``stacking_eval`` returns is printed.
    """
    # Stratified split: assumes the evaluation data has the same label
    # proportions as the training data.
    X_train, X_test, y_train, y_test = train_test_split(
        selected_data,
        labels,
        train_size=train_size,
        stratify=labels,
        random_state=random_state,
    )
    # stacking_eval is defined in the local `stacking` module; it is given the
    # train/test parts and its result is treated as one score or a collection of scores.
    scores = stacking_eval(X_train, X_test, y_train, y_test)
    print("Mean: ",np.mean(scores))

# Leftover from an earlier experiment; `le` is not referenced anywhere in the visible notebook.
#le = LabelEncoder()
# Uncomment to score the model on a hold-out split of the training data instead of predicting.
#evaluate_model(training_data,labels)   # Used for evaluating model

# Pass the training features, the evaluation features and the training labels to
# stacking_get_labels (from the local `stacking` module); the result is written to
# disk in the following cell.
y_pred = stacking_get_labels(training_data,eval_data,labels)    # Used for classifying evaluation data

In [90]:
# Store the predictions as a plain text file: one row per entry of y_pred,
# comma-separated (the to_csv default), with neither index nor header row.
y_pred_df = pd.DataFrame(y_pred)
y_pred_df.to_csv(
    "final_labels.txt",
    index=False,
    header=False,
)


Dragspel   
Dragspel       0.594559
Nyckelharpa    0.215722
Serpent        0.189719
dtype: float64
Dragspel   
Dragspel       0.596360
Nyckelharpa    0.215022
Serpent        0.188619
dtype: float64
Dragspel   
Dragspel       0.595160
Nyckelharpa    0.216622
Serpent        0.188219
dtype: float64
