In [1]:
import seaborn
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt

from sklearn import neighbors, datasets
from sklearn import svm


from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingClassifier

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

import xgboost as xgb

In [2]:
# Pandas --> numpy
class DataFrameSelector(BaseEstimator, TransformerMixin):

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.attribute_names].values

In [3]:
# Preencher vazios
class Preencher_vazios(BaseEstimator, TransformerMixin):

    def __init__(self, values):
        self.values = values

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # values = {'Age': train['Age'].median(),'Cabin':'C1000','Embarked':'X'}
        X.fillna(value=self.values, inplace = True)        
        return X

In [4]:
# Transforma valores (A,cam, ve) em numeros(0,1,2,..) - aplicavel em todo o DataFrame 
class Transformar_categoricos(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.apply(lambda x: pd.factorize(x)[0])        
        return X

In [5]:
def zero_um(matriz):
    matriz[np.where(matriz<=0.50)] = 0
    matriz[np.where(matriz>0.50)] = 1
    return matriz

In [6]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [7]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
    #plt.xlim([0, 1])

In [8]:
def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

In [9]:
train = pd.read_csv('train.csv')
test_set = pd.read_csv('test.csv')

In [10]:
train.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [11]:
test_set.corr()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.026751,-0.034102,0.003818,0.04308,0.008211
Pclass,-0.026751,1.0,-0.492143,0.001087,0.018721,-0.577147
Age,-0.034102,-0.492143,1.0,-0.091587,-0.061249,0.337932
SibSp,0.003818,0.001087,-0.091587,1.0,0.306895,0.171539
Parch,0.04308,0.018721,-0.061249,0.306895,1.0,0.230046
Fare,0.008211,-0.577147,0.337932,0.171539,0.230046,1.0


In [12]:
class Juntar_tratamento(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, train, test_set):
        return self

    def transform(self, train, test_set, y=None):
        #Juntar para tratamento
        train = train.set_index('PassengerId')
        test_set = test_set.set_index('PassengerId')

        lista_index_train = train.index.tolist()
        lista_index_test_set = test_set.index.tolist()

        todos = pd.concat([train,test_set],axis=0)

        test_set = todos.loc[lista_index_test_set].reset_index()
        todos = todos.reset_index()

        # Separar a coluna dos sobreviventes
        train_label = train['Survived']
        train_set = todos.loc[lista_index_train].drop('Survived', axis=1)
        
        return todos, train_label 
        

s = Juntar_tratamento()
todos = s.transform(train,test_set)  

In [13]:
class Pronomes_tratamento(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, todos, y=None):
        return self

    def transform(self, todos):
        # Categorizar pronomes de tratamento
        l = list()
        todos['Name_3'] = ""
        for index, row in todos.iterrows():   
            try:
                a = float(todos.iloc[index]['Age'])

                if np.isnan(a):
                    tipo = 'nan'   
                elif not np.isnan(a):
                    tipo = 'ok'
            except:
                tipo = 'nan'

            jl = todos.iloc[index]['Name']
            c = re.match('(.*?)[\s]*[\,][\s]*', jl)
            a = re.match('(.*?)[\s]*[\,][\s]*Mr[\.\s].*', jl)
            b = re.match('(.*?)[\s]*[\,][\s]*Dr[\.\s].*', jl)

            d = re.match('(.*?)[\s]*[\,][\s]*Master[\.\s].*', jl)    

            e = re.match('(.*?)[\s]*[\,][\s]*Miss[\.\s].*', jl)
            f = re.match('(.*?)[\s]*[\,][\s]*Mrs[\.\s].*', jl)
            g = re.match('(.*?)[\s]*[\,][\s]*Ms[\.\s].*', jl)

            h = re.match('(.*?)[\s]*[\,][\s]Rev[\.\s].*', jl)

            i = re.match('(.*?)[\s]*[\,][\s]*Col[\.\s].*', jl)
            j = re.match('(.*?)[\s]*[\,][\s]*Major[\.\s].*', jl)

            if   a: 
                l.append(0)
            elif b:
                l.append(0)
            elif d:
                l.append(0)
            elif e:       
                l.append(1)
            elif f:       
                l.append(2)
            elif g:       
                l.append(3)
            elif h:       
                l.append(0)
            elif i:       
                l.append(0)
            elif j:       
                l.append(0)
            else:
                l.append(4) 

            if c:
                todos.at[index, 'Name_3'] = c.group(1) #coluna de sobrenomes

         # Atribuir uma idade que esteja faltando - filtrar pro grupo   
            if (a or b  or e or g or h or i or f) and (tipo == 'nan'):
                todos.at[index, 'Age'] = 28.

            elif (d) and (tipo == 'nan'):       
                todos.at[index, 'Age'] = 4.

            elif (tipo == 'nan'):
                todos.at[index, 'Age'] = 28.

        todos['Name_2'] = l
        todos['Cabin_2']= todos['Cabin'].str.extract('(.).*', expand=False)
        
        return todos

s = Pronomes_tratamento()
todos = s.transform(todos)  

In [14]:
class Sobrevivente_familia(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, todos, y=None):
        return self

    def transform(self, todos):
        # Cria uma nova coluna com a chance de sobreviver de acordo com a familia.
        count = 0
        nosurv = 0
        surv = 0
        s_ns = list()
        s_ns_novo = list()
        c_dict ={}
        c_dict_baixo ={}
        c_dict_baixo_2 = {}
        c_dict_alto ={}
        var = len(todos[0:891][todos[0:891]['Survived']==1])/len(todos[0:891])
        todos['Name_4'] = ''
        todos['numero_sobrevivente_familia'] =''
        todos['numero_total_familia'] =''
        todos['relacao_s_ns_familia'] = ''

        #calcula o numero de sobrevivente de cada familia e cria uma coluna
        for index, row in todos[0:891].iterrows():  
            if todos[0:891].iloc[index]['Name_3'] not in c_dict.keys():
                c_dict[todos[0:891].iloc[index]['Name_3']] = [0,0]

            s_ns = c_dict[todos[0:891].iloc[index]['Name_3']]

            if todos[0:891].iloc[index]['Survived'] == 0:            
                s_ns_novo = [s_ns[0] + 1, s_ns[1]]
            elif todos[0:891].iloc[index]['Survived'] == 1: 
                s_ns_novo = [s_ns[0], s_ns[1] + 1]

            c_dict[todos.iloc[index]['Name_3']] = s_ns_novo  

        for index, row in todos.iterrows():  
            if todos.iloc[index]['Name_3'] in c_dict:
                todos.at[index, 'numero_sobrevivente_familia'] =  c_dict[todos.iloc[index]['Name_3']][1] 

                todos.at[index, 'numero_total_familia'] =  sum(c_dict[todos.iloc[index]['Name_3']] ) 

                if todos.iloc[index]['numero_total_familia'] > 1:
                    todos.at[index, 'relacao_s_ns_familia'] = todos.iloc[index]['numero_sobrevivente_familia']/todos.iloc[index]['numero_total_familia']
                else:
                    todos.at[index, 'relacao_s_ns_familia'] = var

            if todos.iloc[index]['Name_3'] not in c_dict:
                todos.at[index, 'numero_sobrevivente_familia'] =  0.

                todos.at[index, 'numero_total_familia'] =  0.

                todos.at[index, 'relacao_s_ns_familia'] = var

            if todos.iloc[index]['relacao_s_ns_familia'] >= 0. and todos.iloc[index]['relacao_s_ns_familia'] < 0.25:
                todos.at[index, 'relacao_s_ns_familia'] = 0.25
            elif todos.iloc[index]['relacao_s_ns_familia'] >= 0.25 and todos.iloc[index]['relacao_s_ns_familia'] < 0.5:        
                todos.at[index, 'relacao_s_ns_familia'] = 0.5
            elif todos.iloc[index]['relacao_s_ns_familia'] >= 0.5 and todos.iloc[index]['relacao_s_ns_familia'] < 0.75:        
                todos.at[index, 'relacao_s_ns_familia'] = 0.75      
            elif todos.iloc[index]['relacao_s_ns_familia'] >= 0.75:       
                todos.at[index, 'relacao_s_ns_familia'] = 1.0   

        todos['Name_4'] =  todos['relacao_s_ns_familia']        
        todos = todos.infer_objects()
        return todos

s = Sobrevivente_familia()
todos = s.transform(todos)  

In [15]:
class Sobrevivente_cabine(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, todos, y=None):
        return self

    def transform(self, todos):
        # Cria uma nova coluna com a chance de sobreviver de acordo com a cabine.
        count = 0
        nosurv = 0
        surv = 0
        s_ns = list()
        s_ns_novo = list()
        c_dict ={}
        c_dict_baixo ={}
        c_dict_baixo_2 = {}
        c_dict_alto ={}
        var = len(todos[0:891][todos[0:891]['Survived']==1])/len(todos[0:891])
        todos['Cabin_3'] = ""
        todos['numero_sobrevivente_cabine'] =''
        todos['numero_total_cabine'] =''
        todos['relacao_s_ns_cabine'] = ''

        #calcula o numero de sobrevivente de cada cabine 
        for index, row in todos[0:891].iterrows():  
            if todos[0:891].iloc[index]['Cabin_2'] not in c_dict.keys():
                c_dict[todos[0:891].iloc[index]['Cabin_2']] = [0,0]

            s_ns = c_dict[todos[0:891].iloc[index]['Cabin_2']]

            if todos[0:891].iloc[index]['Survived'] == 0:            
                s_ns_novo = [s_ns[0] + 1, s_ns[1]]
            elif todos[0:891].iloc[index]['Survived'] == 1: 
                s_ns_novo = [s_ns[0], s_ns[1] + 1]

            c_dict[todos.iloc[index]['Cabin_2']] = s_ns_novo  

        for index, row in todos.iterrows():  
            if todos.iloc[index]['Cabin_2'] in c_dict:
                todos.at[index, 'numero_sobrevivente_cabine'] =  c_dict[todos.iloc[index]['Cabin_2']][1] 

                todos.at[index, 'numero_total_cabine'] =  sum(c_dict[todos.iloc[index]['Cabin_2']] ) 

                if todos.iloc[index]['numero_total_cabine'] > 1:
                    todos.at[index, 'relacao_s_ns_cabine'] = todos.iloc[index]['numero_sobrevivente_cabine']/todos.iloc[index]['numero_total_cabine']
                else:
                    todos.at[index, 'relacao_s_ns_cabine'] = var

            if todos.iloc[index]['Cabin_2'] not in c_dict:
                todos.at[index, 'numero_sobrevivente_cabine'] =  0.

                todos.at[index, 'numero_total_cabine'] =  0.

                todos.at[index, 'relacao_s_ns_cabine'] = var

            if todos.iloc[index]['relacao_s_ns_cabine'] >= 0. and todos.iloc[index]['relacao_s_ns_cabine'] < 0.25:
                todos.at[index, 'relacao_s_ns_cabine'] = 0.25
            elif todos.iloc[index]['relacao_s_ns_cabine'] >= 0.25 and todos.iloc[index]['relacao_s_ns_cabine'] < 0.5:        
                todos.at[index, 'relacao_s_ns_cabine'] = 0.5
            elif todos.iloc[index]['relacao_s_ns_cabine'] >= 0.5 and todos.iloc[index]['relacao_s_ns_cabine'] < 0.75:        
                todos.at[index, 'relacao_s_ns_cabine'] = 0.75      
            elif todos.iloc[index]['relacao_s_ns_cabine'] >= 0.75:       
                todos.at[index, 'relacao_s_ns_cabine'] = 1.0   

        todos['Cabin_3'] =  todos['relacao_s_ns_cabine']        
        todos = todos.infer_objects()
        return todos


s = Sobrevivente_cabine()
todos = s.transform(todos)  

In [16]:
num_attribs = ['Fare','Parch','SibSp', 'Age','Name_4','Cabin_3'] #
cat_attribs = ['Pclass','Sex','Embarked']
values_num = {'Fare': train['Fare'].median()}
values_cat = {'Embarked' : 'X'}

In [17]:
todos, train_label  = Juntar_tratamento().transform(train, test_set)

In [18]:
transformacao_pipeline = Pipeline([
    ('Pronomes_tratamento', Pronomes_tratamento()),
    ('Sobrevivente_familia', Sobrevivente_familia()),
    ('Sobrevivente_cabine', Sobrevivente_cabine())
    ])

In [19]:
todos = transformacao_pipeline.fit_transform(todos)

In [20]:
todos_filtrado = todos[['Fare','Parch','SibSp', 'Age','Name_4','Cabin_3','Pclass','Sex','Embarked']]

In [23]:
num_pipeline = Pipeline([
        ('Preencher_vazios', Preencher_vazios(values_num)),
        ('pd-np', DataFrameSelector(num_attribs)),
        ('std_scaler', StandardScaler()),
    ])
cat_pipeline = Pipeline([
        ('Preencher_vazios', Preencher_vazios(values_cat)),
        ('Transformar_categoricos',Transformar_categoricos()),
        ('pd-np', DataFrameSelector(cat_attribs)),
        ('OneHotEncoder',OneHotEncoder()),
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [24]:
cat_tr = full_pipeline.fit_transform(todos_filtrado)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [25]:
train2 = cat_tr[0:891]
train3 = cat_tr[891:]

In [26]:
#SGD######################################################################
sgd_clf = SGDClassifier(max_iter=1000)
sgd_clf.fit(train2, train_label)

s = sgd_clf.predict(train2)
print (accuracy_score(s, train_label))

t = sgd_clf.predict(train3)
t = zero_um(t)

result_SGD = pd.concat([pd.DataFrame(data=t,columns =['Survived']),
    test_set['PassengerId'].to_frame()],axis=1).set_index('PassengerId')\
    .astype('int64')

#res2 = np.hstack([res2, np.array(result_SGD)])

result_SGD.to_csv('SGD.csv')

scores = cross_val_score(sgd_clf, train2, train_label,
     scoring="accuracy", cv=10)

display_scores(scores)

0.8608305274971941
Scores: [0.88888889 0.84444444 0.82022472 0.85393258 0.8988764  0.82022472
 0.86516854 0.84269663 0.86516854 0.85227273]
Mean: 0.855189819543752
Standard deviation: 0.024433673757311536


In [27]:
knn=neighbors.KNeighborsClassifier()
knn.fit(train2, train_label)

s = knn.predict(train2)
print (accuracy_score(s, train_label))

t = knn.predict(train3)
result_knn = pd.concat([pd.DataFrame(data=t,columns =['Survived']),
    test_set['PassengerId'].to_frame()],axis=1).set_index('PassengerId')\
    .astype('int64')
#res2 = np.hstack([res2, np.array(result_knn)])
result_knn.to_csv('knn.csv')

scores = cross_val_score(knn, train2, train_label,
     scoring="accuracy", cv=10)

display_scores(scores)

0.8888888888888888
Scores: [0.85555556 0.88888889 0.78651685 0.83146067 0.87640449 0.84269663
 0.85393258 0.83146067 0.88764045 0.85227273]
Mean: 0.8506829531267733
Standard deviation: 0.02905862893780325


In [28]:
svwm=svm.SVC(C= 35938.0, gamma= 0.000562)
#svwm=svm.SVC()
svwm.fit(train2, train_label)

s = svwm.predict(train2)
print (accuracy_score(s, train_label))

t = svwm.predict(train3)

result_svn = pd.concat([pd.DataFrame(data=t,columns =['Survived']),
    test_set['PassengerId'].to_frame()],axis=1).set_index('PassengerId')\
    .astype('int64')

result_svn.to_csv('svn.csv')

scores = cross_val_score(svwm, train2, train_label,
     scoring="accuracy", cv=10)

display_scores(scores)

0.8821548821548821
Scores: [0.87777778 0.9        0.83146067 0.87640449 0.8988764  0.82022472
 0.85393258 0.86516854 0.88764045 0.86363636]
Mean: 0.8675122006582681
Standard deviation: 0.025237890296031417


In [29]:
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.1).fit(train2, train_label)

s = gbm.predict(train2)

print (accuracy_score(s, train_label))

t = gbm.predict(train3)

result_xgb = pd.concat([pd.DataFrame(data=t,columns =['Survived']),
    test_set['PassengerId'].to_frame()],axis=1).set_index('PassengerId')\
    .astype('int64')
    
    
#res2 = np.hstack([res2, np.array(result_xgb)])

result_xgb.to_csv('xgb.csv')

scores = cross_val_score(gbm, train2, train_label,
     scoring="accuracy", cv=10)

display_scores(scores)

0.9281705948372615


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Scores: [0.88888889 0.9        0.82022472 0.8988764  0.95505618 0.83146067
 0.91011236 0.86516854 0.86516854 0.875     ]
Mean: 0.8809956304619225
Standard deviation: 0.03709406599925418


  if diff:


In [30]:
## RandomForestRegressor
#forest_reg = RandomForestRegressor()
forest_reg =RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=700, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

forest_reg.fit(train2, train_label)
s = forest_reg.predict(train2)
s = zero_um(s)
print (accuracy_score(s, train_label))

t = forest_reg.predict(train3)
t = zero_um(t)

result_random_forest = pd.concat([pd.DataFrame(data=t,columns =['Survived']),
    test_set['PassengerId'].to_frame()],axis=1).set_index('PassengerId')\
    .astype('int64')

result_random_forest.to_csv('random_forest.csv')

0.9831649831649831


In [31]:
log_clf = LogisticRegression()
svm_clf = svm.SVC(C= 35938.0, gamma= 0.000562, probability=True)
sgd_clf = SGDClassifier(max_iter=1000,loss='log')
forest_reg =RandomForestRegressor()

voting_clf = VotingClassifier(
        estimators=[('lr', log_clf),('gbm',gbm), ('knn',knn), ('svc', svm_clf), ('sgd_dlf', sgd_clf)],
        voting='soft'
    )
voting_clf.fit(train2, train_label)

t = voting_clf.predict(train3)

result_votes = pd.concat([pd.DataFrame(data=t,columns =['Survived']),
    test_set['PassengerId'].to_frame()],axis=1).set_index('PassengerId')\
    .astype('int64')

result_votes.to_csv('result_votes.csv')

  if diff:


In [44]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=3, n_estimators=3, learning_rate=1.0)
gbrt.fit(train2, train_label)
s = gbrt.predict(train2)
s = zero_um(s)
print (accuracy_score(s, train_label))

t = gbrt.predict(train3)



0.8765432098765432


In [40]:
s

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=1.0, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=3, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)