# Kaggle Titanic competition
goal: create fast and submit to learn how to deal with Kaggle


In [1]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing


## data loading

In [2]:
train_dataset=pd.read_csv('train.csv')
test_dataset=pd.read_csv('test.csv')

## data analysis

In [27]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [None]:
train_dataset.describe()

In [None]:
train_dataset.head()


In [None]:
train_dataset.hist(figsize=(15,20));
plt.figure();

In [None]:
sns.pairplot(train_dataset, hue="Survived");

In [None]:
sns.heatmap(train_dataset.corr(),annot=True) 


In [39]:
test_dataset.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

## data preprocessing

In [53]:
def preprocess_data(source_df):
    df=source_df.copy()
    # Age to bins, NaN to separate bin
    df.Age.fillna(-1, inplace=True)
    age_bins=(-10,0,1,2,4,8,16,32,64,500)
    age_labels=('age_unk','age_upto1','age_upto2','age_upto4','age_upto8','age_upto16','age_upto32','age_upto64','age_above64')
    df.Age=pd.cut(df.Age, age_bins, labels=age_labels)
    #cabin - keep first letter
    df.Cabin.fillna('Unknown', inplace=True)
    df.Cabin=df.Cabin.apply(lambda x: x[0])
    # fare - divide by mean (~32)
    df.Fare.fillna(df.Fare.median(), inplace=True)
    df.Fare=df.Fare.apply(lambda x: x/32)
    # embarked - fillna
    df.Embarked.fillna('U', inplace=True)
    # ticket, name - drop
    df=df.drop(['Ticket','Name'], axis=1)
    
    #return preprocessed df
    return df


In [54]:
df_test=preprocess_data(test_dataset)
df_train=preprocess_data(train_dataset)

In [57]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            891 non-null category
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Cabin          891 non-null object
Embarked       891 non-null object
dtypes: category(1), float64(1), int64(5), object(3)
memory usage: 64.0+ KB


In [58]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null object
Age            418 non-null category
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           418 non-null float64
Cabin          418 non-null object
Embarked       418 non-null object
dtypes: category(1), float64(1), int64(4), object(3)
memory usage: 27.0+ KB


In [56]:
df_train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [49]:
def encode(df1, df2):
    '''df1, df2 - train and test datasets'''
    cat_features =['Sex','Pclass','Cabin','Embarked','Age']
    onehot_features=['Cabin','Embarked']
    
    features=onehot_features+cat_features
    df_combined = pd.concat([df1[features],df2[features]])
    #oh_encoder = preprocessing.OneHotEncoder()
    for f in cat_features:
        print(f)
        l_encoder = preprocessing.LabelEncoder()
        l_encoder.fit(df_combined[f])
        df1[f]=l_encoder.transform(df1[f])
        df2[f]=l_encoder.transform(df2[f])
    
    dummies1 = pd.get_dummies(df1,columns=onehot_features)
    dummies2 = pd.get_dummies(df2,columns=onehot_features)
        
    
    return df1,df2
        
    

In [50]:
df_test, df_train = encode(df_test, df_train)

Sex
Pclass
Cabin


ValueError: bad input shape (1309, 2)

In [24]:
features =['Sex','Age','Pclass','Cabin','Embarked']

In [None]:
df_train[features]



In [25]:
df_combined = pd.concat([df_train[features],df_test[features]])

In [14]:
df_combined.head()


Unnamed: 0,Sex,Age,Pclass,Cabin,Embarked
0,1,age_upto32,2,U,S
1,0,age_upto64,0,C,C
2,0,age_upto32,2,U,S
3,0,age_upto64,0,C,S
4,1,age_upto64,2,U,S


In [26]:
df_combined.Cabin.value_counts()

U    687
7    327
C     59
B     47
2     35
D     33
E     32
1     18
A     15
F     13
3     13
4      9
5      8
0      7
G      4
6      1
T      1
Name: Cabin, dtype: int64

In [None]:
df_combined.Age=df_combined.Age.astype('str')

In [None]:
l_encoder = preprocessing.LabelEncoder()
l_encoder.fit(df_combined['Age'])

In [17]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 5 columns):
Sex         1309 non-null int32
Age         1296 non-null category
Pclass      1309 non-null int64
Cabin       1309 non-null object
Embarked    1309 non-null object
dtypes: category(1), int32(1), int64(1), object(2)
memory usage: 47.7+ KB
