In [1]:
import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import io
from scipy import stats

## Data Preprocessor

In [2]:
####DATA PREP####
from sklearn.preprocessing import Imputer, MinMaxScaler
def str_to_int(df):
    """Replace string and categorical columns of ``df`` with integer category codes.

    Every ``object`` column is first cast to the ``category`` dtype; after
    that, every ``category`` column (including ones that were categorical on
    entry) is overwritten with its ``.cat.codes``.

    The frame is modified in place and the same object is returned.
    """
    # Step 1: promote plain object columns to categoricals.
    for name in df.select_dtypes(['object']).columns:
        df[name] = df[name].astype('category')

    # Step 2: swap each categorical column for its integer codes.
    for name in df.select_dtypes(['category']).columns:
        df[name] = df[name].cat.codes

    return df

def count_space_except_nan(x):
    """Return one more than the number of spaces in ``x``, or 0 for non-strings.

    Any value that is not a ``str`` (for example a float NaN coming from a
    missing cell) yields 0.
    """
    return x.count(" ") + 1 if isinstance(x, str) else 0
def encode(data, cols):
    """One-hot encode ``cols``, dropping one reference level per column.

    For each name in ``cols`` the column is expanded with ``pd.get_dummies``
    (dummy names are prefixed with the column name), the last dummy column is
    discarded as the reference level, the source column is removed and the
    remaining dummies are appended to the right of the frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded columns (``data`` itself when ``cols``
        is empty).
    """
    for col in cols:
        dummies = pd.get_dummies(data[col], prefix=col)
        # Trim the reference level from the dummy frame itself.  Dropping
        # "the last column of the combined frame" removes an unrelated
        # feature whenever get_dummies produces no columns (all-NaN input).
        dummies = dummies.iloc[:, :-1]
        data = pd.concat([data.drop(col, axis=1), dummies], axis=1)
    return data
def fillnan(data,cols):
    """Fill missing values in ``cols`` with the mean of each column.

    The fill is done with pandas rather than ``sklearn.preprocessing.Imputer``,
    which no longer exists in current scikit-learn releases.  The result is
    the same column-wise mean imputation.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to update; the selected columns are overwritten in place.
    cols : list of str
        Names of the numeric columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``cols`` stored as floats and their
        missing entries replaced by the column mean.
    """
    # Cast to float so the written-back columns are float, as they were
    # when an imputer returned a float array.
    numeric = data[cols].astype(float)
    # ``mean()`` skips NaN, so each column is filled with the mean of its
    # observed values.
    data[cols] = numeric.fillna(numeric.mean())
    return data
def normalize_age(data,cols):
    """Rescale each column named in ``cols`` with a ``MinMaxScaler``.

    One scaler object (default settings) is created and re-fitted on every
    column in turn, so each column is scaled independently of the others.
    ``data`` is modified in place and returned.
    """
    scaler = MinMaxScaler()
    for name in cols:
        # Present the single column as an (n_rows, 1) array for the scaler.
        column_2d = data[name].values.reshape(-1,1)
        data[name] = scaler.fit_transform(column_2d)
    return data

In [3]:
####DATA CONVERSION###
def split(d):
    """Separate a frame into features and target by column position.

    Returns ``(X, y)`` where ``y`` is the first column of ``d`` and ``X``
    holds every remaining column.
    """
    target = d.iloc[:, 0]
    features = d.iloc[:, 1:]
    return features, target
def convert(X,y):
    """Turn pandas features/target into NumPy arrays.

    ``X`` becomes ``X.values``.  ``y.values`` gets a new leading axis and is
    then transposed, which for a one-dimensional target yields a column of
    shape ``(n, 1)``.

    Returns the pair ``(features, target)``.
    """
    features = X.values
    as_row = y.values[np.newaxis]
    target = as_row.T
    return features, target

In [4]:
###Processing DATA###
def Preprocess(Data,ttype):
    """Turn a raw passenger frame into NumPy arrays for modelling.

    Steps, in order:
      1. Drop the "Name", "Ticket", "PassengerId" and "Cabin" columns.
      2. When ``ttype == "train"``, take the first remaining column as the
         target via ``split``; otherwise treat every column as a feature.
      3. One-hot encode Sex, Embarked, SibSp, Parch and Pclass with ``encode``.
      4. Fill missing Age/Fare with ``fillnan`` and rescale them with
         ``normalize_age``.

    Returns
    -------
    ``(X, y)`` from ``convert`` when ``ttype == "train"``; for any other
    ``ttype`` only the feature array ``X``.
    """
    categorical = ['Sex','Embarked','SibSp','Parch','Pclass']
    continuous = ['Age','Fare']

    trimmed = Data.drop(["Name","Ticket","PassengerId","Cabin"],axis=1)
    is_train = (ttype == "train")

    if is_train:
        features, target = split(trimmed)
    else:
        features, target = trimmed, None

    # Feature pipeline shared by both modes.
    features = encode(features, categorical)
    features = fillnan(features, continuous)
    features = normalize_age(features, continuous)

    if is_train:
        return convert(features, target)

    # NOTE(review): presumably this keeps the test features aligned with the
    # training features (the recorded runs show 19 columns for both); it
    # raises KeyError if the encoded frame has no 'Parch_6' column.
    features.drop(['Parch_6'],inplace = True,axis=1)
    return features.values

In [5]:
# Load the training CSV and run the "train" branch of Preprocess, which
# returns a feature array and a target array.
# NOTE(review): the path is an absolute location on a local D: drive, so this
# cell only runs on a machine with the same directory layout.
d_tr = pd.read_csv("D:/Programming/GitHub/Titanic-Kaggle_TF/train.csv")
X_tr,y_tr = Preprocess(d_tr,"train")
print(X_tr.shape, y_tr.shape)

(891, 19) (891, 1)


In [6]:
# Load the test CSV and run the non-"train" branch of Preprocess, which
# returns only a feature array.
# NOTE(review): absolute local path on a D: drive; not portable to other machines.
d_te = pd.read_csv("D:/Programming/GitHub/Titanic-Kaggle_TF/test.csv")
X_te = Preprocess(d_te,"test")
print(X_te.shape)

(418, 19)


In [7]:
from sklearn.model_selection import train_test_split
# Split the preprocessed training arrays 80/20 into a fitting part
# (X_train, y_train) and a held-out part (X_test, y_test); random_state is
# fixed at 42.  Note that X_test here is carved out of train.csv and is
# distinct from X_te, which comes from test.csv.
X_train, X_test, y_train, y_test = train_test_split(X_tr, y_tr, test_size=0.2,train_size = 0.8, random_state=42)

## SKlearn Modules

In [10]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [17]:
# Fit linear discriminant analysis on the training part of the split.
# y_train was built as a column by convert(), so np.squeeze is applied to it
# before it is passed to fit().
clf = LinearDiscriminantAnalysis()
clf.fit(X_train, np.squeeze(y_train))

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [22]:
# Accuracy of whichever classifier `clf` currently refers to, measured on the
# held-out part of the split; the bare `acc` displays the value.
acc = accuracy_score(y_test, clf.predict(X_test))
acc

0.8044692737430168

In [28]:
from sklearn.linear_model import LogisticRegression
# Logistic regression: configure the estimator, fit it on the training part
# of the split, then score accuracy on the held-out part.
clf = LogisticRegression(
    random_state=5,
    solver='lbfgs',
    multi_class='multinomial',
    n_jobs=2,
    max_iter=10000,
)
clf.fit(X_train, np.squeeze(y_train))
acc = accuracy_score(y_test, clf.predict(X_test))
acc

0.7988826815642458

In [13]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 2000 trees: configure, fit on the training part of the
# split (target flattened with ravel), then score accuracy on the held-out part.
clf = RandomForestClassifier(
    n_estimators=2000,
    random_state=0,
    max_features="sqrt",
    verbose=True,
)
clf.fit(X_train, y_train.ravel())
acc = accuracy_score(y_test, clf.predict(X_test))
acc

  from numpy.core.umath_tests import inner1d
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    1.3s finished
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    0.1s finished


0.8100558659217877

In [10]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, log_loss
# RBF-kernel support vector classifier, fitted on the training part of the
# split and scored by accuracy on the held-out part.
# NOTE(review): this cell carries execution count 10 although it appears after
# the cell numbered 13, so the notebook was not run top to bottom.
clf = SVC(kernel='rbf', max_iter=-1,gamma='auto',verbose=True)
clf.fit(X_train, y_train.ravel())
acc = accuracy_score(y_test, clf.predict(X_test))
acc

[LibSVM]

0.7821229050279329

In [14]:
from sklearn.metrics import r2_score
# NOTE(review): r2_score is a regression metric (coefficient of determination)
# and is applied here to classifier predictions; accuracy_score is what the
# other cells report.
# NOTE(review): the captured output of this cell contains the random forest's
# "[Parallel(n_jobs=1)]: Done 2000 out of 2000" log line, so `clf` was the
# RandomForestClassifier when it was last executed, not the SVC that precedes
# it in file order.  The printed score will differ on a top-to-bottom re-run.
y_pred = clf.predict(X_test)
r2_score(y_test, y_pred)

[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    0.1s finished


0.2167310167310167