In [130]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [131]:
def load_df(f):
    """Load the modelling features from a passenger CSV file.

    Parameters
    ----------
    f : str or file-like
        Anything ``pandas.read_csv`` accepts (path or open buffer).

    Returns
    -------
    pandas.DataFrame
        A new frame holding the columns ``Pclass, Sex, Age, SibSp, Parch,
        Fare, Embarked`` in that order. ``Pclass``, ``Sex`` and ``Embarked``
        are converted to the ``category`` dtype; the categories are inferred
        from the values present in this particular file.

    Raises
    ------
    KeyError
        If one of the seven feature columns is missing from the file.
    """
    feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    categorical_cols = ['Pclass', 'Sex', 'Embarked']

    raw = pd.read_csv(f)
    # A list (not a tuple) is the unambiguous way to select several columns;
    # tuples are how pandas spells a single MultiIndex key.
    features = raw.loc[:, feature_cols]
    # One astype call instead of attribute-style assignment per column.
    return features.astype({col: 'category' for col in categorical_cols})

# Feature matrix: the selected passenger columns, with the categorical
# columns expanded into indicator columns by get_dummies.
dfX = load_df('train.csv')
X = pd.get_dummies(dfX)

# Target vector, taken from a second, unfiltered read of the same file.
df = pd.read_csv('train.csv')
Y = df['Survived']

**Like BittenByPython**

In [132]:
# Raw training table; the target column is taken out before the features
# are transformed.
train_raw = pd.read_csv('train.csv')

labels = train_raw["Survived"]


def to_binstd(df, imputer=None, scaler=None, columns=None, fit=True):
    """One-hot encode, impute and standardise a passenger frame.

    Steps, in order: drop ``PassengerId``, ``Name``, ``Ticket``, ``Cabin``
    (and ``Survived`` when present); one-hot encode ``Pclass``, ``Sex`` and
    ``Embarked``; fill NaNs with the imputer; scale with the scaler.

    Called with only ``df`` the function behaves exactly as before: a fresh
    median ``Imputer`` and a fresh ``StandardScaler`` are fitted on ``df``
    itself. The optional arguments exist so that held-out data can be
    transformed with the statistics learned on the training data instead of
    its own.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger table.
    imputer, scaler : transformer objects, optional
        Objects to use instead of freshly created ones. With ``fit=True``
        they are fitted in place on ``df``; with ``fit=False`` they must
        already be fitted and are only applied.
    columns : sequence of str, optional
        Column layout to enforce after one-hot encoding. Indicator columns
        missing from ``df`` are added filled with 0, columns not listed are
        dropped.
    fit : bool, default True
        Whether to fit the imputer and scaler on ``df``.

    Returns
    -------
    pandas.DataFrame
        The transformed features with a fresh default index.

    Raises
    ------
    ValueError
        If ``fit=False`` is requested without both ``imputer`` and ``scaler``.
    """
    if not fit and (imputer is None or scaler is None):
        raise ValueError("fit=False requires an already fitted imputer and scaler")

    # Drop some dimensions
    drops = ["PassengerId", "Name", "Ticket", "Cabin"]
    if "Survived" in df.columns:
        drops.append("Survived")

    # Convert categorical data to binary data
    encoded = pd.get_dummies(df.drop(drops, axis=1),
                             columns=["Pclass", "Sex", "Embarked"])

    if columns is not None:
        # fill_value only applies to columns created by the reindex, so NaNs
        # already present in the data are still left for the imputer.
        encoded = encoded.reindex(columns=columns, fill_value=0)

    if imputer is None:
        imputer = Imputer(strategy="median")
    if scaler is None:
        scaler = StandardScaler()

    # Fix NaNs, then normalize data
    if fit:
        values = scaler.fit_transform(imputer.fit_transform(encoded))
    else:
        values = scaler.transform(imputer.transform(encoded))

    return pd.DataFrame(values, columns=encoded.columns)


def write_predic(filepath, df, pred):
    """Write predictions to ``filepath`` as a ``PassengerId,Survived`` CSV.

    Parameters
    ----------
    filepath : str
        Destination file; it is overwritten.
    df : object with a ``PassengerId`` attribute
        Source of the ids, typically the raw test frame.
    pred : iterable
        One prediction per id, in the same order.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of ids. The
        previous ``zip``-based version silently truncated to the shorter one.
    """
    ids = list(df.PassengerId)
    preds = list(pred)
    if len(ids) != len(preds):
        raise ValueError(
            "got {} predictions for {} passengers".format(len(preds), len(ids))
        )

    with open(filepath, "w") as f:
        f.write("PassengerId,Survived\n")
        # Every row, including the last one, is newline-terminated.
        f.writelines("{},{}\n".format(i, j) for i, j in zip(ids, preds))


def report_cv_rmse(title, model, X, y, cv=10):
    """Cross-validate ``model`` and print its per-fold RMSE, mean and stdev.

    Returns the tuple ``(scores, rmse)`` where ``scores`` are the raw
    ``neg_mean_squared_error`` values and ``rmse`` is ``sqrt(-scores)``.
    """
    scores = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=cv)
    rmse = np.sqrt(-scores)
    print(title)
    print(rmse)
    print("mean:", rmse.mean())
    print("stdev:", rmse.std())
    return scores, rmse


# The imputer and scaler are fitted once, on the training data, and reused
# unchanged for the test data further down.
imputer = Imputer(strategy="median")
scaler = StandardScaler()

# `df` keeps the meaning it had after this cell before: the standardised
# training features.
df = to_binstd(train_raw, imputer=imputer, scaler=scaler)

# Apply SVM
cls_svm = svm.SVC()
# Fitted on the full training set for the test-set prediction below.
cls_svm.fit(df, labels)

svm_score, svm_rmse = report_cv_rmse("SVM", cls_svm, df, labels)


# Apply LinReg
linreg = LinearRegression()
linreg.fit(df, labels)

linreg_score, linreg_rmse = report_cv_rmse("Linear regression", linreg, df, labels)

# Prediction
dftest_init = pd.read_csv("test.csv")
# Transform only: same column layout, medians and scaling as the training set.
dftest = to_binstd(dftest_init, imputer=imputer, scaler=scaler,
                   columns=df.columns, fit=False)
pred = cls_svm.predict(dftest)
write_predic("/tmp/predic.csv", dftest_init, pred)

SVM
[0.40824829 0.43461349 0.46204236 0.36719404 0.36719404 0.42399915
 0.42399915 0.47404546 0.38218767 0.39886202]
mean: 0.4142385677288476
stdev: 0.034919479912234534
Linear regression
[0.38521106 0.37751059 0.39821157 0.39285864 0.39030777 0.37441657
 0.39442241 0.4056701  0.33142829 0.37352259]
mean: 0.3823559603081141
stdev: 0.0196987452041817


**Own approach**

In [133]:
# Column dtypes and non-null counts of the selected training features.
dfX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null category
Sex         891 non-null category
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    889 non-null category
dtypes: category(3), float64(2), int64(2)
memory usage: 30.8 KB


In [134]:
# Number of missing values in each feature column.
dfX.isnull().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [135]:
# Mean-impute the NaNs of the one-hot training matrix. The fitted imputer
# stays in `imp` because the test features are transformed with it later.
imp = Imputer(missing_values='NaN', strategy='mean')
Xts = imp.fit(X).transform(X)

In [136]:
# SVC with default hyper-parameters, trained on the imputed training matrix.
# The fit call stays the last expression so the cell displays the estimator.
cls = svm.SVC()
cls.fit(X=Xts, y=Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [137]:
test_data = load_df('test.csv')
# load_df infers the categories from each file separately, so the indicator
# columns of the test file are forced into the training layout: a category
# absent from test.csv becomes an all-zero column, and the column order
# matches what `imp` and `cls` were fitted on.
Xtest = pd.get_dummies(test_data).reindex(columns=X.columns, fill_value=0)
Xtest_ts = imp.transform(Xtest)

In [138]:
# Predictions of the mean-imputed SVC for the test set; this rebinds the
# `pred` produced by the earlier prediction cell.
pred = cls.predict(X=Xtest_ts)