# Training Model

### 1. Import library

In [129]:
#pandas
import pandas as pd
from pandas import Series,DataFrame
pd.set_option('display.max_columns', 500)

# machine learning
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score

### 2. Prepare data

In [40]:
train_df = pd.read_csv('resource/train.csv')

In [41]:
def get_wilderness(wilderness_col):
    type1, type2, type3, type4 = wilderness_col
    if (type1) : return 1
    if (type2) : return 2
    if (type3) : return 3
    return 4
train_df['Wilderness'] = train_df[['Wilderness_Area1','Wilderness_Area2','Wilderness_Area3','Wilderness_Area4']].apply(get_wilderness,axis=1)

In [42]:
def get_soil_type(soil_type_col):
    for index, item in enumerate (soil_type_col):
        if (item): return (index + 1)
            

soil_type = []
for i in range(1,41):
    soil_type.append('Soil_Type'+str(i))

train_df['Soil_type'] = train_df[soil_type].apply(get_soil_type,axis=1)

In [151]:
train_df.drop('Id', axis=1)
train, dev = train_test_split(train_df, test_size = 0.01, random_state=1234)

In [152]:
#Original feature

train_original = train
train_original = train_original.drop('Wilderness', axis=1)
train_original = train_original.drop('Soil_type', axis=1)

test_original = dev
test_original = test_original.drop('Wilderness', axis=1)
test_original = test_original.drop('Soil_type', axis=1)

print 




In [45]:
#Combined feature

train_combined = train
test_combined = dev

for i in range(1, 5):
    train_combined = train_combined.drop('Wilderness_Area'+str(i), axis=1)
    test_combined = test_combined.drop('Wilderness_Area'+str(i), axis=1)
    
for i in range(1, 41):
    train_combined = train_combined.drop('Soil_Type'+str(i), axis=1)
    test_combined = test_combined.drop('Soil_Type'+str(i), axis=1)

In [103]:
#Nomalize feature
train_normalize = train
test_normalize = dev

temp_train_1 = train_normalize.ix[:, 1:11 ]
temp_test_1 = test_normalize.ix[:, 1:11]
temp_train_2 = train_normalize.ix[:, 11:56 ]
temp_test_2 = test_normalize.ix[:, 11:56]

normalize = Normalizer()
temp_train_1 = normalize.fit_transform(temp_train_1)
temp_test_1 = normalize.transform(temp_test_1)

train_normalize = numpy.concatenate((temp_train_1,temp_train_2),axis=1)
test_normalize = numpy.concatenate((temp_test_1,temp_test_2),axis=1)
# for i in label:
#     print (train_normalize[i].shape)
#     train_normalize[i] = normalize.fit_transform(train_normalize[i])
#     test_normalize[i] = normalize.transform(test_normalize[i])

### 3. Logistic Regression Model

#### a. Original Feature

In [24]:
y_train = train_original['Cover_Type']
x_train = train_original.drop('Cover_Type', axis=1)

y_test = test_original['Cover_Type']
x_test = test_original.drop('Cover_Type', axis=1)

classifier = LogisticRegression(random_state=1234)
classifier.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1234, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
y_pred = classifier.predict(x_test)

acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print "REPORT"
print report

REPORT
             precision    recall  f1-score   support

          1       0.63      0.56      0.59       416
          2       0.55      0.52      0.54       433
          3       0.57      0.36      0.44       424
          4       0.74      0.84      0.79       402
          5       0.61      0.72      0.66       482
          6       0.53      0.58      0.55       403
          7       0.84      0.93      0.88       464

avg / total       0.64      0.65      0.64      3024



#### b. Combined Feature

In [26]:
y_train = train_combined['Cover_Type']
x_train = train_combined.drop('Cover_Type', axis=1)

y_test = test_combined['Cover_Type']
x_test = test_combined.drop('Cover_Type', axis=1)

classifier = LogisticRegression(random_state=1234)
classifier.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1234, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [27]:
y_pred = classifier.predict(x_test)

acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print "REPORT"
print report

REPORT
             precision    recall  f1-score   support

          1       0.60      0.50      0.55       416
          2       0.53      0.48      0.51       433
          3       0.52      0.50      0.51       424
          4       0.74      0.85      0.79       402
          5       0.63      0.54      0.58       482
          6       0.53      0.62      0.57       403
          7       0.79      0.90      0.84       464

avg / total       0.62      0.63      0.62      3024



### 4. Random Forest

In [142]:
y_train = train_original['Cover_Type']
x_train = train_original.drop('Cover_Type', axis=1)

y_test = test_original['Cover_Type']
x_test = test_original.drop('Cover_Type', axis=1)

classifier = RandomForestClassifier(random_state=1234, n_estimators=40)
classifier.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=40, n_jobs=1, oob_score=False, random_state=1234,
            verbose=0, warm_start=False)

In [143]:
y_pred = classifier.predict(x_test)

acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print "REPORT"
print report

REPORT
             precision    recall  f1-score   support

          1       0.81      0.78      0.79       443
          2       0.80      0.71      0.75       421
          3       0.86      0.85      0.85       428
          4       0.94      0.97      0.96       439
          5       0.90      0.93      0.92       437
          6       0.84      0.89      0.86       404
          7       0.94      0.98      0.96       452

avg / total       0.87      0.87      0.87      3024



In [109]:
y_train = train_combined['Cover_Type']
x_train = train_combined.drop('Cover_Type', axis=1)

y_test = test_combined['Cover_Type']
x_test = test_combined.drop('Cover_Type', axis=1)

classifier = RandomForestClassifier(random_state=1234)
classifier.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=1, oob_score=False, random_state=1234,
            verbose=0, warm_start=False)

In [111]:
y_pred = classifier.predict(x_test)

acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print "REPORT"
print report

REPORT
             precision    recall  f1-score   support

        1.0       0.73      0.76      0.74       443
        2.0       0.71      0.62      0.66       421
        3.0       0.81      0.81      0.81       428
        4.0       0.92      0.97      0.94       439
        5.0       0.90      0.91      0.91       437
        6.0       0.82      0.82      0.82       404
        7.0       0.95      0.96      0.95       452

avg / total       0.84      0.84      0.84      3024



In [110]:
y_train = train_normalize[:, 54]
x_train = train_normalize[:,:54]

y_test = test_normalize[:,54]
x_test = test_normalize[:,:54]

classifier = RandomForestClassifier(random_state=1234)
classifier.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=1234,
            verbose=0, warm_start=False)

In [108]:
y_pred = classifier.predict(x_test)

acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print "REPORT"
print report

REPORT
             precision    recall  f1-score   support

        1.0       0.73      0.76      0.74       443
        2.0       0.71      0.62      0.66       421
        3.0       0.81      0.81      0.81       428
        4.0       0.92      0.97      0.94       439
        5.0       0.90      0.91      0.91       437
        6.0       0.82      0.82      0.82       404
        7.0       0.95      0.96      0.95       452

avg / total       0.84      0.84      0.84      3024



### 5. ExtraTree

In [153]:
y_train = train_original['Cover_Type']
x_train = train_original.drop('Cover_Type', axis=1)

y_test = test_original['Cover_Type']
x_test = test_original.drop('Cover_Type', axis=1)

classifier = ExtraTreesClassifier(random_state=1234, n_estimators=40)
classifier.fit(x_train,y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=40, n_jobs=1, oob_score=False, random_state=1234,
           verbose=0, warm_start=False)

In [158]:
y_pred = classifier.predict(x_test)

acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print "REPORT"
print 'accuracy ', acc
print report

REPORT
accuracy  0.894736842105
             precision    recall  f1-score   support

          1       0.80      0.95      0.87        21
          2       0.90      0.73      0.81        26
          3       0.79      0.79      0.79        19
          4       1.00      0.93      0.97        15
          5       0.95      0.95      0.95        22
          6       0.85      0.92      0.88        25
          7       1.00      1.00      1.00        24

avg / total       0.90      0.89      0.89       152



### 6. Run on test data

In [169]:
train_df = pd.read_csv('resource/train.csv')
test_df = pd.read_csv('resource/test.csv')

train_df.drop('Id', axis=1)

test_id = test_df['Id']
train_df.drop('Id', axis=1)

y_train = train_df['Cover_Type']
x_train = train_df.drop('Cover_Type', axis=1)
classifier = ExtraTreesClassifier(random_state=1234, n_estimators=40)
classifier.fit(x_train,y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=40, n_jobs=1, oob_score=False, random_state=1234,
           verbose=0, warm_start=False)

In [170]:
x_test = test_df
y_pred = classifier.predict(x_test)

with open("submission.csv", "w") as subfile:
    subfile.write("Id,Cover_Type\n")
    for i in range(len(test_id)):
        subfile.write("%s,%s\n"%(test_id[i],y_pred[i]))

#### Score on Kaggle: 0.71231