In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import re
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import category_encoders as ce
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [95]:
train_df=pd.read_csv('train.csv')

In [96]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [97]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [98]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [99]:
features_nan=[feature for feature in train_df.columns if train_df[feature].isnull().sum()>1 and train_df[feature].dtypes=='O']

for feature in features_nan:
    print("{}: {}% missing values".format(feature,np.round(train_df[feature].isnull().mean(),4)))

Cabin: 0.771% missing values
Embarked: 0.0022% missing values


In [100]:
def age_to_group(age):
    if 0 < age < 5:
        # children
        return 0
    elif 5 <= age < 12:
        # adult
        return 1
    elif 12 <= age < 16:
        # adult
        return 2
    elif 16 <= age < 21:
        # adult
        return 3
    elif 21 <= age < 26:
        # adult
        return 4
    elif 26<= age < 32:
        # adult
        return 5
    elif 32 <= age < 40:
        # adult
        return 6
    elif 40 <= age < 50:
        # adult
        return 7
    elif age >= 50:
        # elderly people
        return 8

In [101]:
def clean_name_title(val):
    if val not in ['Mr', 'Miss', 'Mrs', 'Master', 'Dr','Rev']:
        return 'RARE'
    else:
        return val

In [102]:
def get_cabin(cabin_no):
    if cabin_no=="U":
        return 0
    else:
        return 1

In [103]:
def fare_grop_div(fare):
    if 0 < fare < 10:
        # children
        return 0
    elif 10 <= fare < 20:
        # adult
        return 1
    elif 20 <= fare < 30:
        return 2
    elif 30 <= fare < 40:
        return 3
    elif 40 <= fare < 50:
        return 4
    elif 50 <= fare < 60:
        return 5
    elif 60 <= fare < 70:
        return 6
    elif  fare > 70:
        return 7

In [104]:
def process_data(df,train):
    df['Cabin'].fillna('U',inplace=True)
    df['Cabin']=df['Cabin'].apply(get_cabin)
    le = LabelEncoder()
    df['Cabin'] = le.fit_transform(df['Cabin'])
    df['total_family_members'] = df['Parch'] + df['SibSp'] + 1
    df['name_title'] = df['Name'].str.extract('([A-Za-z]+)\.', expand=False)
    print('name_values')
    print(df['name_title'].value_counts())
    df.drop(['PassengerId','Name','Parch','SibSp'], axis=1, inplace=True)
    if train == "train_df":
        df = df[df['Fare'] < 85]
        df = df[df['Age'] < 65]
    df['Embarked'].fillna("S", inplace=True)
    df=pd.get_dummies(df,columns=['Embarked'],drop_first=True)
    df=pd.get_dummies(df,columns=['Sex'],drop_first=True)
    df['name_title'] = le.fit_transform(df['name_title'])
    age_median= df['Age'].median()
    df['Age'].fillna(age_median, inplace=True)
    fare_median= df['Fare'].median()
    df['Fare'].fillna(fare_median, inplace=True)
    df["fare_group"]=df['Fare'].apply(lambda x : int(x))
    df['age_group'] = df['Age'].apply(age_to_group)
    df['fare_group'] = df['fare_group'].apply(fare_grop_div)
    df['fare_group'].fillna(0,inplace=True)
    req_cols=['age_group','name_title','fare_group']
    count_enc =ce.CountEncoder(cols=req_cols)
    count_enc.fit(df[req_cols])
    df=df.join(count_enc.transform(df[req_cols]).add_suffix('_count'))
    print(df.head())
    
    try:
        y=df['Survived']
        print(y)
    except:
        y='nope'
        
    print(y)
    
    
#     x=df.drop(['Ticket','Fare','Age','age_group','name_title','fare_group'], axis=1, inplace=True)
    
    if 'nope' not in y:
        x=df.drop(['Ticket','Survived','Fare','Age','age_group','name_title','fare_group'], axis=1, inplace=True)
    else:
        x=df.drop(['Ticket','Fare','Age','age_group','name_title','fare_group'], axis=1, inplace=True)
    return df,y

train_df,y=process_data(train_df,'train_df')


X_train, X_test, y_train, y_test = train_test_split(train_df,y, test_size=0.2,random_state=105)

feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state=0)) # remember to set the seed, the random state in this function
feature_sel_model.fit(X_train, y_train)




name_values
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Mlle          2
Major         2
Don           1
Lady          1
Mme           1
Sir           1
Ms            1
Capt          1
Jonkheer      1
Countess      1
Name: name_title, dtype: int64
   Survived  Pclass   Age            Ticket     Fare  Cabin  \
0         0       3  22.0         A/5 21171   7.2500      0   
1         1       1  38.0          PC 17599  71.2833      1   
2         1       3  26.0  STON/O2. 3101282   7.9250      0   
3         1       1  35.0            113803  53.1000      1   
4         0       3  35.0            373450   8.0500      0   

   total_family_members  name_title  Embarked_Q  Embarked_S  Sex_male  \
0                     2          10           0           1         1   
1                     2          11           0           0         0   
2                     1           7           0           1         0   
3             

  X.loc[:, self.cols] = X.fillna(value=pd.np.nan)


SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=0,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [105]:
model =RandomForestClassifier(criterion = "gini", 
                                       min_samples_leaf = 1, 
                                       min_samples_split = 10,   
                                       n_estimators=260, 
                                       max_features='auto', 
                                       oob_score=True, 
                                       random_state=42,warm_start=True, 
                                       n_jobs=-1)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=260,
                       n_jobs=-1, oob_score=True, random_state=42, verbose=0,
                       warm_start=True)

In [106]:
val_predictions =model.predict(X_test)
print('number of value in validation_',len(val_predictions))
accuracy = accuracy_score(y_test, val_predictions)
print('validation_accuracy',accuracy)
print('confusion matrix_',confusion_matrix(y_test, val_predictions))


number of value in validation_ 129
validation_accuracy 0.8527131782945736
confusion matrix_ [[78  4]
 [15 32]]


In [108]:
test_df=pd.read_csv('test.csv')
def get_cabin(cabin_no):
    if cabin_no=="U":
        return 0
    else:
        return 1  
test_df['Cabin'].fillna('U',inplace=True)
test_df['Cabin']=test_df['Cabin'].apply(get_cabin)


age_median= test_df['Fare'].median()
test_df['Fare'].fillna(age_median, inplace=True)

df_mini_orr=test_df[['PassengerId','Pclass','Age','Fare','Cabin']]

df_mini = df_mini_orr.copy()


is_NaN = df_mini.isnull()
row_has_NaN = is_NaN.any(axis=1)
test_data = df_mini[row_has_NaN]

test_data.head()

df_mini.dropna(inplace=True)
test_data.head()
df_mini.info()

train_labels=df_mini['Age']
train_features=df_mini.drop(['PassengerId','Age'],axis=1)
test_labels=test_data['Age']
test_features=test_data.drop(['PassengerId','Age'],axis=1)

scaler = StandardScaler() 
X_train_scaled = scaler.fit_transform(train_features)
X_test_scaled = scaler.transform(test_features)

model_test =RandomForestRegressor(bootstrap= True,
                             max_features= 'sqrt',
                            min_samples_split= 8,
                            n_estimators=20)

model_test.fit(X_train_scaled,train_labels)
val_predictions =model_test.predict(X_test_scaled)
print(val_predictions)
final_test_df=pd.DataFrame({'PassengerId':test_data['PassengerId'],'Age1':val_predictions})
final_test_df.head()
test_df=test_df.set_index('PassengerId').join(final_test_df.set_index('PassengerId'))
test_df['Age'].fillna(test_df['Age1'],inplace=True)
test_df.drop(['Age1'], axis=1,inplace=True)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 332 entries, 0 to 415
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  332 non-null    int64  
 1   Pclass       332 non-null    int64  
 2   Age          332 non-null    float64
 3   Fare         332 non-null    float64
 4   Cabin        332 non-null    int64  
dtypes: float64(2), int64(3)
memory usage: 15.6 KB


<IPython.core.display.Javascript object>

[26.10337898 38.53640693 24.03472955 21.84669545 23.55805973 20.78267857
 43.99007937 26.20317766 25.41314103 22.88982932 30.72316447 23.55805973
 26.10337898 24.62248009 31.88959524 26.20317766 26.04774281 23.55805973
 26.20317766 26.20317766 22.3580969  26.04774281 30.06897727 26.20317766
 26.20317766 24.46177121 26.06210956 30.06897727 43.56395854 37.92003274
 26.10337898 25.4678824  31.61449009 41.78511141 28.24587461 28.31256459
 26.20317766 19.9347619  39.75836607 23.55805973 24.04827972 41.76605838
 29.38314089 24.12918193 23.55805973 13.67760728 26.20317766 24.12918193
 26.10337898 21.84669545 31.88959524 28.24587461 26.20317766 26.10337898
 44.18511364 28.24587461 23.55805973 26.20317766 24.04827972 26.62598991
 26.20317766 30.42719246 22.74746032 23.55805973 35.24269481 28.31256459
 24.03472955 24.62103175 26.20317766 28.88337461 26.62598991 28.31256459
 19.9347619  29.01059524 24.12918193 26.20317766 19.9347619  31.88959524
 26.20317766 29.01059524 32.13438827 23.8178824  26

In [109]:
test_df.to_csv('test_formated.csv')

In [110]:
test_df=pd.read_csv('test_formated.csv')

In [111]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        418 non-null    int64  
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 36.0+ KB


In [113]:
test_df,y=process_data(test_df,'test_df')

name_values
Mr        240
Miss       78
Mrs        72
Master     21
Rev         2
Col         2
Dr          1
Dona        1
Ms          1
Name: name_title, dtype: int64
   Pclass   Age   Ticket     Fare  Cabin  total_family_members  name_title  \
0       3  34.5   330911   7.8292      0                     1           5   
1       3  47.0   363272   7.0000      0                     2           6   
2       2  62.0   240276   9.6875      0                     1           5   
3       3  27.0   315154   8.6625      0                     1           5   
4       3  22.0  3101298  12.2875      0                     3           6   

   Embarked_Q  Embarked_S  Sex_male  fare_group  age_group  age_group_count  \
0           1           0         1         0.0          6               49   
1           0           1         0         0.0          7               51   
2           1           0         1         0.0          8               36   
3           0           1         1         0.

  X.loc[:, self.cols] = X.fillna(value=pd.np.nan)


In [114]:
test_df.head()


Unnamed: 0,Pclass,Cabin,total_family_members,Embarked_Q,Embarked_S,Sex_male,age_group_count,name_title_count,fare_group_count
0,3,0,1,1,0,1,49,240,155
1,3,0,2,0,1,0,51,72,155
2,2,0,1,1,0,1,36,240,155
3,3,0,1,0,1,1,107,240,155
4,3,0,3,0,1,0,101,72,83


In [115]:
try:
    test_df.drop('Unnamed: 0', axis=1,inplace=True)
except:
    pass

In [117]:
test_predictions =model.predict(test_df)

In [118]:
len(test_predictions)

418

In [124]:
df2=pd.read_csv('test.csv')
p_ID=df2["PassengerId"].tolist()
df_sub=pd.DataFrame()
df_sub["PassengerId"]=p_ID
df_sub["Survived"]=test_predictions
df_sub.set_index('PassengerId')

df_sub.to_csv("predictions.csv")

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,1
3,895,0
4,896,1
