In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import VotingClassifier

## 0 Pre-process Data

#### 0.1 Training Data

In [68]:
# Training-data preparation: load the CSV, encode the text columns as numbers,
# drop three rows by position, keep the features chosen by a boosted-tree
# importance filter, and standardize what remains.
# Produces: labels, x7, x12, X, y, clf, model, scaler.

# load data
train_data = pd.read_csv('./TrainOnMe-4.csv', index_col=0)

# delete nan
train_data = train_data.dropna()

# Correct 'chottis' and 'olka' first, so the misspellings never enter the
# category list and the encoding does not depend on how the typos sort.
train_data.loc[train_data['x7'] == 'chottis', 'x7'] = 'Schottis'
train_data.loc[train_data['x7'] == 'olka', 'x7'] = 'Polka'

# transform text to number: the position of a value in its sorted list of
# distinct values becomes its numeric code
labels = np.sort(train_data.y.unique())
x7 = np.sort(train_data.x7.unique())
x12 = np.sort(train_data.x12.unique())

# replace 'Atsuto' with 0, 'Bob' with 1, 'Jorg' with 2
for code, name in enumerate(labels):
    train_data.loc[train_data['y'] == name, 'y'] = code

# replace x7 categories with their sorted position
for code, category in enumerate(x7):
    train_data.loc[train_data['x7'] == category, 'x7'] = code

# replace 'False' or 'Nope False' with 0, 'True' or 'YEP True' with 1
# NOTE(review): relies on x12 sorting to exactly these four spellings in this
# order -- confirm against the data if the file changes.
for position, flag in enumerate((0, 0, 1, 1)):
    train_data.loc[train_data['x12'] == x12[position], 'x12'] = flag

# delete extreme elements; every position is looked up in the frame as it
# stands after the previous drop, so the three removals are sequential
for position in (57, 197, 953):
    train_data = train_data.drop(train_data.index[position])
train_data = np.array(train_data)   # (996, 14), object
# column 4 may still hold numbers stored as text; force it to float
train_data[:, 4] = train_data[:, 4].astype(float)

# define dataset X and label y (column 0 is the encoded label)
X = train_data[:, 1:]
y = train_data[:, 0].astype('int')
print(X[0], X[0][3], X[0][4], X[0][7], X[0][8], X[0][9], X[0][10], X.shape)

# delete irrelevant features
# random_state is fixed so that a fresh "Restart & Run All" selects the same
# columns every time; without it the stochastic model can pick a different
# subset from run to run.
clf = GradientBoostingClassifier(learning_rate=0.05, n_estimators=300, max_depth=4,
                                 max_features='sqrt', random_state=0)
clf.fit(X, y)
model = SelectFromModel(clf, prefit=True)
X = model.transform(X)
print(X[0], X.shape)

# standardize data
scaler = StandardScaler()
X = scaler.fit_transform(X)
# report the columns that were really kept instead of a hard-coded list
print(f'selected features: x{model.get_support(indices=True).tolist()}')

[-0.21695 0.88123 -4.95144 -2.3789 2.18407 0.84635 4 1.70754 0.44414
 -9.29924 -1.26958 1 -3.56049] -2.3789 2.18407 1.70754 0.44414 -9.29924 -1.26958 (996, 13)
[-2.3789 2.18407 1.70754 0.44414 -9.29924 -1.26958] (996, 6)
selected features: x[3, 4, 7, 8, 9, 10]


#### 0.2 Testing Data

In [69]:
# Evaluation-data preparation: apply the SAME feature selection and the SAME
# scaling that were fitted on the training data.
eval_frame = pd.read_csv('./EvaluateOnMe-4.csv', index_col=0)

# Boolean mask of the columns kept by the training-time SelectFromModel.
# Using it here keeps the evaluation columns in step with the training
# columns instead of trusting a hand-maintained index list.
selected_mask = model.get_support()
assert eval_frame.shape[1] == selected_mask.shape[0], \
    'evaluation columns do not line up with the training feature columns'
eval_data = np.array(eval_frame.loc[:, selected_mask])

# Re-use the scaler fitted on the training features. Fitting a fresh scaler
# on the evaluation set would standardize it with different means/variances
# than the ones the classifiers were trained on.
eval_data = scaler.transform(eval_data)
print(eval_data.shape)

(10000, 6)


## 1 Train Model

In [72]:
# Base learners used by the soft-voting ensemble.
clf1 = GradientBoostingClassifier(n_estimators=300, learning_rate=0.05, max_depth=4, max_features='sqrt')
clf2 = RandomForestClassifier(n_estimators=300, max_depth=6, max_features='sqrt')
clf3 = ExtraTreesClassifier(n_estimators=300, max_depth=10, max_features='sqrt')
clf4 = SVC(C=0.8, kernel='rbf', gamma='auto', probability=True)

# (name prefix, estimator, number of entries): each learner is entered several
# times under distinct names -- gbc1..3, rfc1..3, etc1..2, svc1..6.
ensemble_plan = [('gbc', clf1, 3), ('rfc', clf2, 3), ('etc', clf3, 2), ('svc', clf4, 6)]
voters = [
    (f'{prefix}{k}', estimator)
    for prefix, estimator, copies in ensemble_plan
    for k in range(1, copies + 1)
]
eclf = VotingClassifier(estimators=voters, voting='soft')

# 4-fold cross-validated accuracy of the ensemble: per-fold scores and their mean
score = cross_val_score(eclf, X, y, cv=4)
print('Ensemble:', score, score.mean())

Ensemble: [0.77911647 0.81927711 0.79518072 0.79116466] 0.7961847389558233


## 2 Test Model

In [73]:
def mean_refit_proba(model, X_train, y_train, X_eval, n_rounds):
    """Refit ``model`` ``n_rounds`` times and average its class probabilities.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
    X_train, y_train : training features and integer labels.
    X_eval : array whose rows are scored after every refit.
    n_rounds : number of fit/predict repetitions to average over.

    Returns
    -------
    ndarray of shape (X_eval.shape[0], n_classes): the mean of the
    ``predict_proba`` outputs across all rounds.
    """
    # derive the class count from the labels instead of hard-coding 3
    n_classes = len(np.unique(y_train))
    runs = np.zeros((n_rounds, X_eval.shape[0], n_classes))
    for r in range(n_rounds):
        model.fit(X_train, y_train)
        runs[r] = model.predict_proba(X_eval)
    return np.mean(runs, axis=0)


epoch = 50

# first batch of `epoch` refits: averaged probabilities, top probability, top label
y_pre = mean_refit_proba(eclf, X, y, eval_data, epoch)
prob_pre = np.max(y_pre, axis=1)
label_pre = np.argmax(y_pre, axis=1)

# second, independent batch of `epoch` refits, compared with the first one later
y_pre1 = mean_refit_proba(eclf, X, y, eval_data, epoch)
prob_pre1 = np.max(y_pre1, axis=1)
label_pre1 = np.argmax(y_pre1, axis=1)

In [74]:
# Compare the two averaged prediction batches row by row.
same_label = label_pre == label_pre1
cnt = int(np.sum(same_label))                  # rows where both batches agree
diff = np.flatnonzero(~same_label).tolist()    # row indices where they disagree
print(cnt, len(diff))

# For every row keep the label of whichever batch is more confident.
# np.where builds a NEW array, so label_pre is left untouched; the previous
# `label_out = label_pre` only aliased it and the merge overwrote label_pre
# in place, which made re-running this cell give different counts.
label_out = np.where(prob_pre1 > prob_pre, label_pre1, label_pre)

9994 6


In [75]:
# Write one class name per line to pre.txt, in the order of the predictions.
pre = label_out.tolist()

# numeric code -> name written to the file; codes outside the table are skipped
name_by_code = {0: 'Atsuto', 1: 'Bob', 2: 'Jorg'}
with open('pre.txt', 'w') as f:
    f.writelines(
        name_by_code[code] + '\n'
        for code in pre
        if code in name_by_code
    )