In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype

import warnings
from sklearn.exceptions import DataConversionWarning
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Silence two warning categories for the rest of the session. They are
# registered in the same order as before: scikit-learn's
# DataConversionWarning first, then the built-in FutureWarning.
warnings.simplefilter('ignore', DataConversionWarning)
warnings.simplefilter('ignore', FutureWarning)

#### Read the TSV file with **sep='\t'** (the values are tab-separated)

In [3]:
# Load the training set; the file is tab-separated, hence the explicit sep.
data = pd.read_csv(
    filepath_or_buffer="train.tsv",
    sep="\t",
)

In [4]:
# Preview the first five rows of the training data.
data.head(n=5)

Unnamed: 0,id,Y,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,p,f,y,n,f,f,f,c,n,...,k,w,w,p,w,o,e,w,v,d
1,2,p,f,y,y,f,f,f,c,b,...,k,b,p,p,w,o,l,h,y,g
2,3,e,b,y,w,t,l,f,c,b,...,s,w,w,p,w,o,p,k,n,m
3,5,p,x,s,b,t,f,f,c,b,...,s,w,w,p,w,o,p,h,v,u
4,7,p,x,s,w,t,f,f,c,b,...,f,w,w,p,w,o,p,h,v,g


#### Check for missing (null) values

In [5]:
# Count missing values per column (`isna` is the same check as `isnull`).
data.isna().sum()

id                          0
Y                           0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [6]:
# Pull the target column out of `data` and wrap it in a one-column DataFrame.
y = data['Y'].to_frame()

#### Encode the target: set p = 1 and e = 0,
#### where 1 means toxic and 0 means safe

In [7]:
# Encode the target as integers: 'p' (toxic) -> 1, 'e' (safe) -> 0.
#
# The labels are read from `data['Y']`, which this cell does not modify, so
# re-running the cell always produces the same result.
#
# The explicit int64 cast matters: writing ints into the string column through
# `.loc` can leave it as object dtype, which scikit-learn classifiers may
# reject as an unknown label type.
#
# Any label other than 'p'/'e' maps to NaN and makes the cast fail loudly
# instead of slipping through unencoded.
y['Y'] = data['Y'].map({'p': 1, 'e': 0}).astype('int64')

In [8]:
# Class balance of the encoded target.
y.Y.value_counts()

0    2103
1    1959
Name: Y, dtype: int64

In [9]:
# Bind X to the same DataFrame object as `data` (an alias, not a copy).
X = data

In [10]:
# Build the feature matrix by removing the row identifier and the target.
#
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated and no longer accepts from 2.0 onwards.
#
# Dropping from `data` rather than from `X` keeps the cell re-runnable: a
# second run no longer fails because the columns are already gone.
X = data.drop(columns=['id', 'Y'])
X

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,f,y,n,f,f,f,c,n,b,t,...,k,w,w,p,w,o,e,w,v,d
1,f,y,y,f,f,f,c,b,p,e,...,k,b,p,p,w,o,l,h,y,g
2,b,y,w,t,l,f,c,b,k,e,...,s,w,w,p,w,o,p,k,n,m
3,x,s,b,t,f,f,c,b,h,t,...,s,w,w,p,w,o,p,h,v,u
4,x,s,w,t,f,f,c,b,w,t,...,f,w,w,p,w,o,p,h,v,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4057,f,y,n,f,s,f,c,n,b,t,...,k,p,p,p,w,o,e,w,v,p
4058,x,f,y,f,f,f,c,b,g,e,...,k,b,b,p,w,o,l,h,v,p
4059,k,y,n,f,y,f,c,n,b,t,...,s,w,p,p,w,o,e,w,v,d
4060,x,s,n,t,p,f,c,n,p,e,...,s,w,w,p,w,o,p,k,v,u


#### Convert all features to dummy (one-hot) variables

In [11]:
# One-hot encode every feature column in a single call.
#
# This replaces a loop that dropped each column with the positional axis
# argument (`drop(feature, 1)`, not accepted by pandas 2.0+) and re-concatenated
# the frame once per feature.
#
# `columns=list(X.columns)` forces every column to be encoded regardless of its
# dtype, and `prefix_sep='__'` reproduces the existing column names
# (e.g. `cap-shape__b`) in the same order: features in their original order,
# categories sorted within each feature.
#
# NOTE: the cell rebinds `X`, so it should be run once per fresh `X`.
X = pd.get_dummies(X, columns=list(X.columns), prefix_sep='__')

In [12]:
# Display the one-hot encoded feature matrix.
X

Unnamed: 0,cap-shape__b,cap-shape__c,cap-shape__f,cap-shape__k,cap-shape__s,cap-shape__x,cap-surface__f,cap-surface__g,cap-surface__s,cap-surface__y,...,population__s,population__v,population__y,habitat__d,habitat__g,habitat__l,habitat__m,habitat__p,habitat__u,habitat__w
0,0,0,1,0,0,0,0,0,0,1,...,0,1,0,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,1,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4057,0,0,1,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
4058,0,0,0,0,0,1,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4059,0,0,0,1,0,0,0,0,0,1,...,0,1,0,1,0,0,0,0,0,0
4060,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0


In [13]:
# Hold out part of the data for validation. `test_size=0.25` spells out the
# fraction scikit-learn uses when neither size is given, and the fixed
# random_state keeps the shuffle repeatable.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

#### Use logistic regression

In [14]:
# Fit a logistic-regression classifier with its default settings.
#
# `train_y` is a single-column DataFrame, but scikit-learn expects a 1-D
# target; given a column vector it emits a DataConversionWarning and flattens
# it itself. Flatten explicitly so the fit does not depend on that warning
# being suppressed.
logreg = LogisticRegression()
logreg.fit(train_X, train_y.values.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
# Predict on the held-out split and report mean accuracy on that same split
# (the split produced by train_test_split, not the separate test file).
y_pred_logreg = logreg.predict(X=val_X)
print(
    'Accuracy of logistic regression classifier on test set: {:.2f}'.format(
        logreg.score(X=val_X, y=val_y)
    )
)

Accuracy of logistic regression classifier on test set: 1.00


#### Use Random Forest Classifier

In [16]:
# Fit a random forest and report mean accuracy on the held-out split.
#
# `random_state` is fixed because a random forest is stochastic (bootstrap
# samples and feature sub-sampling); without a seed, the fitted model and the
# predictions written to the submission file can change from run to run.
#
# The target is flattened to 1-D because `train_y` is a single-column
# DataFrame, which would otherwise trigger a DataConversionWarning.
RFC = RandomForestClassifier(random_state=1)
RFC.fit(train_X, train_y.values.ravel())
y_pred_RFC = RFC.predict(val_X)
print('Accuracy of RFC on test set: {:.2f}'.format(RFC.score(val_X, val_y)))

Accuracy of RFC on test set: 1.00


#### Read test data

In [17]:
# Load the tab-separated test set and strip the row identifier so only
# feature columns remain. `test_data` keeps the ids for the submission files.
#
# `columns=` replaces the positional axis argument (`drop('id', 1)`), which
# pandas deprecated and no longer accepts from 2.0 onwards.
test_data = pd.read_csv('test.tsv', sep='\t')
test_X = test_data.drop(columns='id')

In [18]:
# One-hot encode the test features and align them with the training matrix.
#
# Encoding the test file on its own yields only the categories that occur in
# it, so the columns can differ from what the models were fitted on. Reindexing
# to `X.columns` (the encoded training features) makes the columns match:
#   - categories missing from the test file become all-zero columns;
#   - categories never seen in training are dropped, since the fitted models
#     have no coefficient or split for them.
#
# The encoding starts from `test_data` rather than `test_X`, so re-running the
# cell gives the same result. It also avoids the positional axis argument
# (`drop(feature, 1)`) that pandas 2.0+ rejects.
raw_test_features = test_data.drop(columns='id')
test_X = pd.get_dummies(
    raw_test_features,
    columns=list(raw_test_features.columns),
    prefix_sep='__',
).reindex(columns=X.columns, fill_value=0)

In [19]:
# Display the one-hot encoded test feature matrix.
test_X

Unnamed: 0,cap-shape__b,cap-shape__c,cap-shape__f,cap-shape__k,cap-shape__s,cap-shape__x,cap-surface__f,cap-surface__g,cap-surface__s,cap-surface__y,...,population__s,population__v,population__y,habitat__d,habitat__g,habitat__l,habitat__m,habitat__p,habitat__u,habitat__w
0,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,1,...,0,0,1,1,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,1,...,0,0,1,1,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4057,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4058,0,0,1,0,0,0,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0
4059,0,0,0,0,0,1,1,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4060,0,0,1,0,0,0,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0


In [20]:
# Logistic-regression predictions for the encoded test features.
test_preds_logreg = logreg.predict(X=test_X)

In [21]:
# Two-column submission frame: the test ids plus the logistic-regression
# predictions, in that column order.
output_logreg = test_data[['id']].assign(Output=test_preds_logreg)

In [22]:
# Write the logistic-regression submission without the index column.
output_logreg.to_csv(
    path_or_buf='submission_r.csv',
    index=False,
)

In [23]:
# Random-forest predictions for the encoded test features.
test_preds_RFC = RFC.predict(X=test_X)


In [24]:
# Two-column submission frame: the test ids plus the random-forest
# predictions, in that column order.
output_RFC = test_data[['id']].assign(Output=test_preds_RFC)

In [25]:
# Write the random-forest submission without the index column.
output_RFC.to_csv(
    path_or_buf='submission_RFC.csv',
    index=False,
)