In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 讀取資料

In [29]:
# 為了處理方便，把 'train.csv' 和 'test.csv' 合併起來，'test.csv'的 Weather 欄位用 0 補起來。
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test['Label'] = np.zeros((len(df_test),))

# 以 train_end_idx 作為 'train.csv' 和 'test.csv' 分界列
train_end_idx = len(df)
df = pd.concat([df, df_test], sort=False)
df.head()

Unnamed: 0,Date,Loc,TempLow,TempHigh,Evaporation,Sun,WindDir,WindSpeed,DayWindDir,NightWindDir,...,DayHumidity,NightHumidity,Pressure9am,Pressure3pm,DayCloud,NightCloud,Temp9am,Temp3pm,Label,RISK_MM
0,2014/2/6,Mildura,18.2,32.0,4.4,11.06,WSW,49.0,WSW,W,...,31.838029,98.826652,,1011.968,2.4,,25.55,26.86,0.0,
1,2014/4/24,,2.0,,,5.3,N,67.0,,NNE,...,,68.618172,1022.624,1011.704,,3.705932,17.23,16.78,0.0,7.3
2,2013/4/5,Canberra,,24.8,5.389921,8.26,,20.0,,NNW,...,50.438684,,1027.328,1017.864,7.2,,5.53,19.16,0.0,0.0
3,,Richmond,,27.32,5.389921,,ESE,35.0,,NE,...,55.586129,97.610733,1034.384,1024.464,,3.705932,19.18,22.1,0.0,1.8
4,2011/8/13,,,26.84,-2.8,,S,61.0,,,...,,,1015.12,,,1.2,15.28,,0.0,19.6


# 資料preprocessing

- 去除無用欄位
- 把非數值欄位做fit_transform
- 對著Nan補植，這裡是用column的median來補

In [30]:
from sklearn.preprocessing import LabelEncoder

'''
###all columns###
['Date', 'Loc', 'TempLow', 'TempHigh', 'Evaporation', 'Sun', 'WindDir', 'WindSpeed', 'DayWindDir', 
'NightWindDir', 'WindSpeed9am', 'WindSpeed3pm', 'DayHumidity', 'NightHumidity', 'Pressure9am', 'Pressure3pm', 
'DayCloud', 'NightCloud', 'Temp9am', 'Temp3pm', 'Label', 'RISK_MM']
'''

### drop useless column ###
df = df.drop(columns = ['Date'])
labelencoder = LabelEncoder()

### transform nominal column to numeric one ###
df['Loc'] = labelencoder.fit_transform(df['Loc'])
df['WindDir'] = labelencoder.fit_transform(df['WindDir'])
df['DayWindDir'] = labelencoder.fit_transform(df['DayWindDir'])
df['NightWindDir'] = labelencoder.fit_transform(df['NightWindDir'])

# skip the Na values while finding the median 
df = df.fillna(df.median(axis = 0, skipna = True)) 

#print(df[df.Label == 0].shape[0], df[df.Label == 1].shape[0])
df.head()

Unnamed: 0,Loc,TempLow,TempHigh,Evaporation,Sun,WindDir,WindSpeed,DayWindDir,NightWindDir,WindSpeed9am,...,DayHumidity,NightHumidity,Pressure9am,Pressure3pm,DayCloud,NightCloud,Temp9am,Temp3pm,Label,RISK_MM
0,20,18.2,32.0,4.4,11.06,15,49.0,15,13,36.0,...,31.838029,98.826652,1020.198382,1011.968,2.4,3.705932,25.55,26.86,0.0,2.5
1,49,2.0,31.76,5.389921,5.3,3,67.0,16,5,49.0,...,55.999241,68.618172,1022.624,1011.704,4.133813,3.705932,17.23,16.78,0.0,7.3
2,9,12.8,24.8,5.389921,8.26,16,20.0,16,6,8.0,...,50.438684,86.336648,1027.328,1017.864,7.2,3.705932,5.53,19.16,0.0,0.0
3,34,12.8,27.32,5.389921,8.094582,2,35.0,16,4,24.0,...,55.586129,97.610733,1034.384,1024.464,4.133813,3.705932,19.18,22.1,0.0,1.8
4,49,12.8,26.84,-2.8,8.094582,8,61.0,16,16,19.0,...,55.999241,86.336648,1015.12,1013.610038,4.133813,1.2,15.28,26.02,0.0,19.6


# Dataset分割及處理

## split data into Traning and validation sets
- 分成training 及 validation set

## resampling
- SMOTE + ENN

## normalize
- use min-max normalization method

## feature selection
- use SelectKBest method

In [31]:
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN
from sklearn import preprocessing
from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

### get training & testing sets ###
# X : Features, y : Label
X_train, X_val, y_train, y_val = train_test_split(
    df.drop(columns = ['Label']).values[:train_end_idx, :],
    df['Label'].values[:train_end_idx], test_size=0.2)
X_test = df.drop(columns = ['Label']).values[train_end_idx:, :]


#X_res, y_res = X_train, y_train

### Use SMOTE+ENN resampling method ###
X_res, y_res = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all')).fit_resample(X_train, y_train)

###all feautres###
nor_col = ['Loc', 'TempLow', 'TempHigh', 'Evaporation', 'Sun', 'WindDir', 'WindSpeed', 'DayWindDir', 
'NightWindDir', 'WindSpeed9am', 'WindSpeed3pm', 'DayHumidity', 'NightHumidity', 'Pressure9am', 'Pressure3pm', 
'DayCloud', 'NightCloud', 'Temp9am', 'Temp3pm', 'RISK_MM']

### min-max normalization ###
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_res)
X_res = scaler.transform(X_res)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
#X_train=(X_train-X_train.min())/(X_train.max()-X_train.min())
#X_train = (X_train-X_train.mean())/X_train.std()
#print(df[nor_col].shape, df[nor_col])


### feature select ###

#select best k features
SelectK = SelectKBest(chi2, k=16)
SelectK.fit(X_res, y_res)
X_res = SelectK.transform(X_res)
X_val = SelectK.transform(X_val)
X_test = SelectK.transform(X_test)
'''
selectK_mask=SelectK.get_support()

# find no need column
selectK_mask=selectK_mask.tolist()
nor_col = [i for i in nor_col if selectK_mask[nor_col.index(i)] ]
print(selectK_mask)
drop_col = [i for i in range(len(selectK_mask)) if not selectK_mask[i] ]
print(drop_col)

# modify training set
X_res = np.delete(X_res, drop_col, 1)  # delete [drop_column] column of dataset
# of course we need to drop column for val and test too, otherwise Number of columns doesn't match number of features in booster
X_val = np.delete(X_val, drop_col, 1)
X_test = np.delete(X_test, drop_col, 1)
'''

"\nselectK_mask=SelectK.get_support()\n\n# find no need column\nselectK_mask=selectK_mask.tolist()\nnor_col = [i for i in nor_col if selectK_mask[nor_col.index(i)] ]\nprint(selectK_mask)\ndrop_col = [i for i in range(len(selectK_mask)) if not selectK_mask[i] ]\nprint(drop_col)\n\n# modify training set\nX_res = np.delete(X_res, drop_col, 1)  # delete [drop_column] column of dataset\n# of course we need to drop column for val and test too, otherwise Number of columns doesn't match number of features in booster\nX_val = np.delete(X_val, drop_col, 1)\nX_test = np.delete(X_test, drop_col, 1)\n"

# Training

In [32]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier


estimators = [
    ('xgb', XGBClassifier(verbosity=0,
                      max_depth=15,
                      learning_rate=0.1,
                      n_estimators=2000,
                      min_child_weight=5,
                      max_delta_step=0,
                      subsample=0.8,
                      colsample_bytree=0.7,
                      reg_alpha=0,
                      reg_lambda=0.4,
                      scale_pos_weight=0.8,
                      silent=True,
                      objective='binary:logistic',
                      missing=None,
                      eval_metric='error',
                      seed=1440,
                      gamma=0) )
]

model = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)
'''
#train xgboost model
model = XGBClassifier(verbosity=0,
                      max_depth=15,
                      learning_rate=0.1,
                      n_estimators=2000,
                      min_child_weight=5,
                      max_delta_step=0,
                      subsample=0.8,
                      colsample_bytree=0.7,
                      reg_alpha=0,
                      reg_lambda=0.4,
                      scale_pos_weight=0.8,
                      silent=True,
                      objective='binary:logistic',
                      missing=None,
                      eval_metric='error',
                      seed=1440,
                      gamma=0)
'''
model.fit_transform(X_res,y_res)

#model = RandomForestClassifier(max_depth=4, random_state=0)
#model.fit(X_res,y_res)
#model = LogisticRegression().fit(X_res,y_res)

#predict
y_pred_decision = model.predict(X_val)
print('Accuracy: %f' % accuracy_score(y_val, y_pred_decision))
print('f1-score: %f' % f1_score(y_val, y_pred_decision))



Accuracy: 0.831769
f1-score: 0.405826


In [19]:
ans_pred = model.predict(X_test)
df_sap = pd.DataFrame(ans_pred.astype(int), columns = ['Label'])
df_sap.to_csv('myAns.csv',  index_label = 'Id')

In [None]:
Accuracy: 0.946672
f1-score: 0.689312
    
Accuracy: 0.852673
f1-score: 0.460417