In [1]:
## Import pandas 
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict
import time
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
## Import dataset 

train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
col = None


In [5]:
class ShapeCabin(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X.Cabin = X['Cabin'].astype(str).str[0]
        a = pd.get_dummies(X.Cabin)
        X.drop("Cabin", axis=1, inplace=True)
        X = pd.concat([X,a],axis=1)
        return X

In [6]:
class ShapeEmbarked(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        a = pd.get_dummies(X.Embarked, prefix='em')
        X.drop("Embarked", axis=1, inplace=True)
        X = pd.concat([X,a],axis=1)
        return X

In [7]:
class Drop(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X.drop('PassengerId', axis=1, inplace=True)
        X.drop('Ticket', axis=1, inplace=True)
        X.drop('Name', axis=1, inplace=True)
        return X

In [8]:
class HotSex(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        le = LabelEncoder()
        y_encoded = le.fit_transform(X['Sex'])
        X["Sex"] = y_encoded
        global col
        col = X.columns

        return X

In [9]:
from sklearn.compose import ColumnTransformer

In [10]:

tratar_dados = Pipeline([
    ('drop',Drop()),
    ('tratar_cabins',ShapeCabin()),
    ('tratar_embarked',ShapeEmbarked()),
    ('hot_encoded_seeex',HotSex()),
    ('imputer',SimpleImputer(strategy="median")),
])

In [11]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [12]:
df = tratar_dados.fit_transform(train)

In [13]:
df = pd.DataFrame(df, columns=col)

In [14]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,A,B,C,D,E,F,G,T,n,em_C,em_Q,em_S
0,0.0,3.0,1.0,22.0,1.0,0.0,7.2500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,1.0,0.0,38.0,1.0,0.0,71.2833,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,3.0,0.0,26.0,0.0,0.0,7.9250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,1.0,1.0,0.0,35.0,1.0,0.0,53.1000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,3.0,1.0,35.0,0.0,0.0,8.0500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,2.0,1.0,27.0,0.0,0.0,13.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
887,1.0,1.0,0.0,19.0,0.0,0.0,30.0000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
888,0.0,3.0,0.0,28.0,1.0,2.0,23.4500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
889,1.0,1.0,1.0,26.0,0.0,0.0,30.0000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [15]:
X_train, X_test, y_train, y_test = train_test_split(df.drop("Survived", axis=1), df['Survived'], test_size=0.33)

In [16]:
from  sklearn.ensemble import RandomForestClassifier

In [17]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,A,B,C,D,E,F,G,T,n,em_C,em_Q,em_S
621,1.0,1.0,42.0,1.0,0.0,52.5542,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
13,3.0,1.0,39.0,1.0,5.0,31.2750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
820,1.0,0.0,52.0,1.0,1.0,93.5000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
154,3.0,1.0,28.0,0.0,0.0,7.3125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
404,3.0,0.0,20.0,0.0,0.0,8.6625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,1.0,0.0,24.0,0.0,0.0,83.1583,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
190,2.0,0.0,32.0,0.0,0.0,13.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
502,3.0,0.0,28.0,0.0,0.0,7.6292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
441,3.0,1.0,20.0,0.0,0.0,9.5000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [18]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {"n_estimators":[3,10,30], "max_features":[2, 4, 6, 8],"max_depth" : [15,20]},
    {"bootstrap": [False], 'n_estimators':[3, 10], 'max_features':[2,3,4],"max_depth" : [15,20]},
]

forest_reg = RandomForestClassifier()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                          scoring="neg_mean_squared_error",
                          return_train_score=True)
training_start = time.perf_counter()
grid_search.fit(X_train, y_train)


# rfc = RandomForestClassifier(n_estimators=10)
# rfc.fit(X_train, y_train)
training_end = time.perf_counter()
prediction_start = time.perf_counter()
#preds = rfc.predict(X_test)
preds = grid_search.predict(X_test)

prediction_end = time.perf_counter()
acc_rfc = (preds == y_test).sum().astype(float) / len(preds)*100
rfc_train_time = training_end-training_start
rfc_prediction_time = prediction_end-prediction_start
print("Scikit-Learn's Random Forest Classifier's prediction accuracy is: %3.2f" % (acc_rfc))
print("Time consumed for training: %4.3f seconds" % (rfc_train_time))
print("Time consumed for prediction: %6.5f seconds" % (rfc_prediction_time))

Scikit-Learn's Random Forest Classifier's prediction accuracy is: 80.00
Time consumed for training: 4.136 seconds
Time consumed for prediction: 0.00458 seconds


In [19]:
feature_importances= grid_search.best_estimator_.feature_importances_
col2 = X_train.columns
xerene = {'feature': feature_importances, 'valor': col2}
pd.DataFrame(xerene).sort_values(by=['feature'])

Unnamed: 0,feature,valor
13,0.00038,T
12,0.001278,G
11,0.002433,F
9,0.003769,D
6,0.004386,A
8,0.005385,C
7,0.005466,B
16,0.008255,em_Q
10,0.008331,E
15,0.01071,em_C


In [20]:
xgb = XGBClassifier(n_estimators=100)
training_start = time.perf_counter()
xgb.fit(X_train, y_train)
training_end = time.perf_counter()
prediction_start = time.perf_counter()
preds = xgb.predict(X_test)
prediction_end = time.perf_counter()
acc_xgb = (preds == y_test).sum().astype(float) / len(preds)*100
xgb_train_time = training_end-training_start
xgb_prediction_time = prediction_end-prediction_start
print("XGBoost's prediction accuracy is: %3.2f" % (acc_xgb))
print("Time consumed for training: %4.3f" % (xgb_train_time))
print("Time consumed for prediction: %6.5f seconds" % (xgb_prediction_time))

XGBoost's prediction accuracy is: 82.71
Time consumed for training: 0.046
Time consumed for prediction: 0.00176 seconds


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 19 columns):
Survived    891 non-null float64
Pclass      891 non-null float64
Sex         891 non-null float64
Age         891 non-null float64
SibSp       891 non-null float64
Parch       891 non-null float64
Fare        891 non-null float64
A           891 non-null float64
B           891 non-null float64
C           891 non-null float64
D           891 non-null float64
E           891 non-null float64
F           891 non-null float64
G           891 non-null float64
T           891 non-null float64
n           891 non-null float64
em_C        891 non-null float64
em_Q        891 non-null float64
em_S        891 non-null float64
dtypes: float64(19)
memory usage: 132.4 KB


In [22]:
sub = tratar_dados.fit_transform(test)
sub = pd.DataFrame(sub, columns=col)
sub

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,A,B,C,D,E,F,G,n,em_C,em_Q,em_S
0,3.0,1.0,34.5,0.0,0.0,7.8292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,3.0,0.0,47.0,1.0,0.0,7.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,2.0,1.0,62.0,0.0,0.0,9.6875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,3.0,1.0,27.0,0.0,0.0,8.6625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,3.0,0.0,22.0,1.0,1.0,12.2875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3.0,1.0,27.0,0.0,0.0,8.0500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
414,1.0,0.0,39.0,0.0,0.0,108.9000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
415,3.0,1.0,38.5,0.0,0.0,7.2500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
416,3.0,1.0,27.0,0.0,0.0,8.0500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [23]:
sub["T"]=[0]*418

In [24]:
preds = grid_search.predict(sub)
submit = pd.DataFrame({"PassengerId":range(892,1310),"Survived":preds})
submit.Survived = submit.Survived.astype(int)

In [25]:
submit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [26]:
submit.to_csv("rfc.csv",index=False)