In [418]:
import os
import pandas as pd

TRAINING_FILE = "train.csv"
TESTING_FILE = "test.csv"

def load_data(file, header=True):
    csv_path = os.path.join("", file)
    if header:
        return pd.read_csv(csv_path)
    else:
        return pd.read_csv(csv_path, header=None)


data = load_data(TRAINING_FILE)
test_data = load_data(TESTING_FILE)
datacopy = data.copy()

In [419]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
import pandas as pd

def age_filler(data, test_data):
    agedata = data.copy()
    test_agedata = test_data.copy()
    
    agedata.drop("Survived", axis=1, inplace=True)
    sex_encoder = LabelBinarizer()
    sex_encoder.fit(agedata["Sex"])
        
    for set in (agedata, test_agedata):
        set.drop("Name", axis=1, inplace=True)
        set.drop("Ticket", axis=1, inplace=True)
        set.drop("Cabin", axis=1, inplace=True)
        set.drop("Embarked", axis=1, inplace=True)
        set["Gender"] = sex_encoder.transform(set["Sex"])
        set.drop("Sex", axis=1, inplace=True)
    
    X_ = agedata[agedata["Age"].notnull()]
    Y = X_["Age"].values
    X_ = X_.drop("Age", axis=1)
    X_ = X_.drop("PassengerId", axis=1)
    X = X_[list(X_)].values
    scalar = StandardScaler()
    scalar.fit(X)
    X = pd.DataFrame(scalar.transform(X), columns=X_.columns)
    
    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(X, Y)

    dfs = []
    for set in (agedata, test_agedata):
        test_X_ = set[set["Age"].isnull()]
        ids = test_X_["PassengerId"].copy()
        test_X_ = test_X_.drop("Age", axis=1)
        test_X_ = test_X_.drop("PassengerId", axis=1)
        test_X = test_X_[list(test_X_)].values
        test_X = pd.DataFrame(scalar.transform(test_X), columns=test_X_.columns)

        test_Y = tree_reg.predict(test_X)
    
        df = pd.DataFrame(columns=["PassengerId", "Age_new"])
        df["PassengerId"] = ids.values
        df["Age_new"] = test_Y
        dfs.append(df)
        
    return dfs

dfs = age_filler(data, test_data)

In [420]:
data.drop("Name", axis=1, inplace=True)
data.drop("Ticket", axis=1, inplace=True)
test_data.drop("Name", axis=1, inplace=True)
test_data.drop("Ticket", axis=1, inplace=True)

In [421]:
df_train = dfs[0]
data = pd.merge(left=data, right=df_train, how='left', on='PassengerId')
data["Age_new"].fillna(0, inplace=True)
data["Age"].fillna(0, inplace=True)
data["Age"] = data["Age_new"] + data["Age"]
data.drop("Age_new", axis=1, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(3)
memory usage: 76.6+ KB


In [422]:
df_test = dfs[1]
test_data = pd.merge(left=test_data, right=df_test, how='left', on='PassengerId')
test_data["Age_new"].fillna(0, inplace=True)
test_data["Age"].fillna(0, inplace=True)
test_data["Age"] = test_data["Age_new"] + test_data["Age"]
test_data.drop("Age_new", axis=1, inplace=True)
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 9 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 32.7+ KB


In [423]:
data["Fare"].fillna(data["Fare"].median(), inplace=True)
test_data["Fare"].fillna(test_data["Fare"].median(), inplace=True)

In [424]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(3)
memory usage: 76.6+ KB


In [425]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 9 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           418 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 32.7+ KB


In [426]:
data[data["Embarked"].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
61,62,1,1,female,38.0,0,0,80.0,B28,
829,830,1,1,female,62.0,0,0,80.0,B28,


In [427]:
data["Embarked"].fillna("C", inplace=True)
test_data["Embarked"].fillna("C", inplace=True)

In [428]:
data["Fare_Per_Age"] = data["Fare"] / data["Age"]
test_data["Fare_Per_Age"] = test_data["Fare"] / test_data["Age"]

In [429]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Fare_Per_Age
0,1,0,3,male,22.0,1,0,7.25,,S,0.329545
1,2,1,1,female,38.0,1,0,71.2833,C85,C,1.875876
2,3,1,3,female,26.0,0,0,7.925,,S,0.304808
3,4,1,1,female,35.0,1,0,53.1,C123,S,1.517143
4,5,0,3,male,35.0,0,0,8.05,,S,0.23


In [430]:
%matplotlib inline
import matplotlib.pyplot as plt
#data.hist(bins=50, figsize=(20,15))
#plt.show()

In [431]:
#data.plot(kind="scatter", x="Pclass", y="Fare_Per_Age")

In [432]:
#data.plot(kind="scatter", x="Survived", y="Fare_Per_Age")

In [433]:
#data.plot(kind="scatter", x="Survived", y="Fare")

In [434]:
#data.plot(kind="scatter", x="Survived", y="Age")

In [435]:
data["Cabin"].fillna("Z", inplace=True)
data["Chamber"] = [x[0] for x in data["Cabin"]]
data.drop("Cabin", axis=1, inplace=True)

In [436]:
test_data["Cabin"].fillna("Z", inplace=True)
test_data["Chamber"] = [x[0] for x in test_data["Cabin"]]
test_data.drop("Cabin", axis=1, inplace=True)

In [437]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 10 columns):
PassengerId     418 non-null int64
Pclass          418 non-null int64
Sex             418 non-null object
Age             418 non-null float64
SibSp           418 non-null int64
Parch           418 non-null int64
Fare            418 non-null float64
Embarked        418 non-null object
Fare_Per_Age    418 non-null float64
Chamber         418 non-null object
dtypes: float64(3), int64(4), object(3)
memory usage: 35.9+ KB


In [438]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId     891 non-null int64
Survived        891 non-null int64
Pclass          891 non-null int64
Sex             891 non-null object
Age             891 non-null float64
SibSp           891 non-null int64
Parch           891 non-null int64
Fare            891 non-null float64
Embarked        891 non-null object
Fare_Per_Age    891 non-null float64
Chamber         891 non-null object
dtypes: float64(3), int64(5), object(3)
memory usage: 83.5+ KB


In [439]:
label = data["Survived"].copy()
data.drop("Survived", axis=1, inplace=True)

In [440]:
ids = data["PassengerId"].copy()
data.drop("PassengerId", axis=1, inplace=True)
ids_test = test_data["PassengerId"].copy()
test_data.drop("PassengerId", axis=1, inplace=True)

In [441]:
sex_encoder = LabelBinarizer()
sex_encoder.fit(data["Sex"])

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [442]:
data["Gender"] = sex_encoder.transform(data["Sex"])
data.drop("Sex", axis=1, inplace=True)
test_data["Gender"] = sex_encoder.transform(test_data["Sex"])
test_data.drop("Sex", axis=1, inplace=True)

In [443]:
data.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Fare_Per_Age,Chamber,Gender
0,3,22.0,1,0,7.25,S,0.329545,Z,1
1,1,38.0,1,0,71.2833,C,1.875876,C,0
2,3,26.0,0,0,7.925,S,0.304808,Z,0
3,1,35.0,1,0,53.1,S,1.517143,C,0
4,3,35.0,0,0,8.05,S,0.23,Z,1


In [444]:
test_data.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Fare_Per_Age,Chamber,Gender
0,3,34.5,0,0,7.8292,Q,0.226933,Z,1
1,3,47.0,1,0,7.0,S,0.148936,Z,0
2,2,62.0,0,0,9.6875,Q,0.15625,Z,1
3,3,27.0,0,0,8.6625,S,0.320833,Z,1
4,3,22.0,1,1,12.2875,S,0.558523,Z,0


In [445]:
data_Chamber = data["Chamber"]
test_data_Chamber = test_data["Chamber"]
chamber_encoder = LabelBinarizer()
chamber_encoder.fit(data_Chamber)
data.drop("Chamber", axis=1, inplace=True)
test_data.drop("Chamber", axis=1, inplace=True)

In [446]:
data_Embarked = data["Embarked"]
test_data_Embarked = test_data["Embarked"]
embarked_encoder = LabelBinarizer()
embarked_encoder.fit(data_Embarked)
data.drop("Embarked", axis=1, inplace=True)
test_data.drop("Embarked", axis=1, inplace=True)

In [447]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass          891 non-null int64
Age             891 non-null float64
SibSp           891 non-null int64
Parch           891 non-null int64
Fare            891 non-null float64
Fare_Per_Age    891 non-null float64
Gender          891 non-null int64
dtypes: float64(3), int64(4)
memory usage: 95.7 KB


In [448]:
data.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Fare_Per_Age,Gender
0,3,22.0,1,0,7.25,0.329545,1
1,1,38.0,1,0,71.2833,1.875876,0
2,3,26.0,0,0,7.925,0.304808,0
3,1,35.0,1,0,53.1,1.517143,0
4,3,35.0,0,0,8.05,0.23,1


In [449]:
features = data[list(data)].values

In [450]:
import numpy as np
#np.shape(features)

In [451]:
#np.shape(embarked_encoder.transform(data_Embarked))

In [452]:
#np.shape(chamber_encoder.transform(data_Chamber))

In [453]:
features = np.concatenate((features, embarked_encoder.transform(data_Embarked)), axis=1)
#np.shape(features)

In [454]:
features = np.concatenate((features, chamber_encoder.transform(data_Chamber)), axis=1)
#np.shape(features)

In [455]:
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy="median")
imputer.fit(features)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)

In [456]:
features = imputer.transform(features)

In [457]:
scalar = StandardScaler()
scalar.fit(features)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [458]:
features = scalar.transform(features)

In [459]:
test_features = test_data[list(test_data)].values
test_features = np.concatenate((test_features, embarked_encoder.transform(test_data_Embarked)), axis=1)
test_features = np.concatenate((test_features, chamber_encoder.transform(test_data_Chamber)), axis=1)
np.shape(test_features)

(418, 19)

In [460]:
test_features = imputer.transform(test_features)

In [461]:
test_features = scalar.transform(test_features)

In [462]:
#Linear Regression

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(features, label)

from sklearn.metrics import mean_squared_error

predicted_label = lin_reg.predict(features)
lin_mse = mean_squared_error(label, predicted_label)
lin_rmse = np.sqrt(lin_mse)
print("RMSE on training data", lin_rmse)


#Saving the model
from sklearn.externals import joblib

LINEAR_REG_MODEL_NAME = "linear_classification_model.pk1"
joblib.dump(lin_reg, LINEAR_REG_MODEL_NAME)


# Cross Validation
lin_reg = joblib.load(LINEAR_REG_MODEL_NAME)

from sklearn.model_selection import cross_val_score

scores = cross_val_score(lin_reg, features, label, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
print("Mean", scores.mean())
print("SD", scores.std())

RMSE on training data 0.3725613134831162
Mean -1.3338529146140013e+22
SD 4.0015587438420036e+22


In [463]:
# Decision tree Regression

from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(features, label)

from sklearn.metrics import mean_squared_error

predicted_label = tree_reg.predict(features)
tree_mse = mean_squared_error(label, predicted_label)
tree_rmse = np.sqrt(tree_mse)
print("RMSE on training data", tree_rmse)


#Saving the model
from sklearn.externals import joblib

TREE_MODEL_NAME = "tree_reg_model.pk1"
joblib.dump(tree_reg, TREE_MODEL_NAME)


# Cross Validation
tree_reg = joblib.load(TREE_MODEL_NAME)

from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, features, label, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
print("Mean", scores.mean())
print("SD", scores.std())

RMSE on training data 0.10104516787489985
Mean -0.23651688330346698
SD 0.034904813026574076


In [464]:
test_label = np.floor(tree_reg.predict(test_features))

In [465]:
df_result = pd.DataFrame(columns=["PassengerId", "Survived"])
df_result["PassengerId"] = ids_test.values
df_result["Survived"] = test_label

In [466]:
df_result.to_csv("results.csv", encoding='utf-8', index=False)