In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

In [2]:
%%capture
# Run the companion data-preparation notebook with its output captured (hidden).
# Presumably this is what defines the `train` and `test` frames used below --
# TODO confirm against _prepare_data.ipynb.
%run _prepare_data.ipynb

In [3]:
import warnings
# Silence warnings raised from the scipy module whose message starts with
# "internal gelsd" (the `message` argument is a regex anchored at the start).
# NOTE(review): presumably this is the notice emitted while fitting
# LinearRegression below -- confirm it is still needed.
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [4]:
# Work on the training frame under a shorter, dataset-specific name.
titanic = train

# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Initialize our algorithm class
linear = LinearRegression()

# Generate cross-validation folds for the train dataset. KFold.split yields the
# row indices of the train part and the test part of each fold.
# shuffle is left at its default (False), so the folds are consecutive blocks of
# rows and are identical on every run. random_state is deliberately NOT passed:
# it only has a meaning together with shuffle=True, and newer scikit-learn
# releases raise a ValueError when it is supplied without shuffling.
kf = KFold(n_splits=3)

# One array of out-of-fold predictions per fold, in fold order.
predictions = []
for train_index, test_index in kf.split(titanic):
    # The predictors we're using to train the algorithm. Note how we only take the rows in the train folds.
    train_predictors = titanic[predictors].iloc[train_index, :]
    # The target we're using to train the algorithm.
    train_target = titanic["Survived"].iloc[train_index]
    # Train the algorithm using the predictors and target.
    # (Each call refits `linear`, so after the loop it holds the last fold's fit.)
    linear.fit(train_predictors, train_target)
    # We can now make predictions on the held-out test fold.
    test_predictions = linear.predict(titanic[predictors].iloc[test_index, :])
    predictions.append(test_predictions)

In [5]:
# Inspect the raw out-of-fold predictions: a list holding one float array per fold.
predictions

[array([  8.99877810e-02,   9.60756206e-01,   5.92676278e-01,
          9.31138728e-01,   5.29343071e-02,   1.70275685e-01,
          3.69943590e-01,   1.03474847e-01,   5.21597906e-01,
          8.74491050e-01,   6.48883611e-01,   8.29742769e-01,
          1.34797198e-01,  -1.61126844e-01,   6.58141307e-01,
          6.39819748e-01,   1.51733875e-01,   2.95432718e-01,
          5.35377959e-01,   6.21007683e-01,   2.61872592e-01,
          2.62687561e-01,   7.31739160e-01,   5.05995897e-01,
          5.61398567e-01,   3.35039734e-01,   1.30338808e-01,
          4.68765767e-01,   6.60737753e-01,   9.10819218e-02,
          4.77223920e-01,   1.04220026e+00,   6.60691613e-01,
          8.71539273e-02,   5.28550732e-01,   4.01874338e-01,
          1.30340307e-01,   1.29339672e-01,   5.72717129e-01,
          6.65238822e-01,   4.83215779e-01,   7.60807408e-01,
          1.30578363e-01,   8.71867121e-01,   7.09855487e-01,
          9.11369897e-02,   1.39181745e-01,   6.60691613e-01,
        

In [6]:
# Join the per-fold prediction arrays end to end into a single 1-D array.
# Each fold's array has only one axis, so the default axis (0) is the one we want.
# This rebinds `predictions` from a list of arrays to one array.
predictions = np.concatenate(predictions)

In [7]:
# Turn the continuous regression outputs into class labels (the only possible
# outcomes are 1 and 0), cutting at 0.5. Both masks are taken before writing so
# the second comparison never sees values overwritten by the first assignment.
above_cutoff = predictions > .5
at_or_below_cutoff = predictions <= .5
predictions[above_cutoff] = 1
predictions[at_or_below_cutoff] = 0

In [8]:
# Accuracy = fraction of rows whose thresholded prediction equals the true label.
# Compare element-wise and average the resulting booleans. Do not index
# `predictions` with the match mask and sum the result: that adds up predicted
# *values*, so correctly predicted 0s contribute nothing to the total.
# `.values` drops the pandas index so the comparison is purely positional; this
# relies on `predictions` being in the same row order as `titanic`.
assert len(predictions) == len(titanic), "predictions must cover every training row"
accuracy = np.mean(predictions == titanic["Survived"].values)

  if __name__ == '__main__':


In [9]:
# Display the accuracy value computed in the previous cell.
accuracy

0.78338945005611671

In [10]:
# Preview the first rows of the test frame before predicting on it.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,2
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,0
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,2
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,0


In [11]:
# Score the test set with the fitted linear model, then cut the continuous
# outputs at 0.5 to obtain 1/0 labels.
# NOTE(review): the only `fit` call visible in this notebook is inside the
# cross-validation loop, so `linear` here appears to be the model fitted on the
# last fold's training rows rather than on all of `train` -- confirm that this
# is intended.
prediction = linear.predict(test[predictors])
predicted_positive = prediction > .5
predicted_negative = prediction <= .5
prediction[predicted_positive] = 1
prediction[predicted_negative] = 0
print(prediction)

[ 0.  0.  0.  0.  1.  0.  1.  0.  1.  0.  0.  0.  1.  0.  1.  1.  0.  0.
  1.  1.  0.  0.  1.  1.  1.  0.  1.  0.  0.  0.  0.  0.  1.  1.  0.  0.
  1.  1.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  1.  1.  0.  0.  1.  1.
  0.  0.  0.  0.  0.  1.  0.  0.  0.  1.  0.  1.  1.  0.  0.  1.  1.  0.
  1.  0.  1.  0.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  1.  1.  1.  0.
  1.  0.  1.  0.  0.  0.  1.  0.  1.  0.  1.  0.  0.  0.  1.  0.  0.  0.
  0.  0.  0.  1.  1.  1.  1.  0.  0.  1.  0.  1.  1.  0.  1.  0.  0.  1.
  0.  1.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  1.  0.  1.  1.  0.  0.
  0.  0.  0.  0.  0.  0.  1.  0.  0.  1.  0.  0.  1.  1.  0.  1.  1.  0.
  1.  0.  0.  1.  0.  0.  1.  1.  0.  0.  0.  0.  0.  1.  1.  0.  1.  1.
  0.  0.  1.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.
  0.  1.  1.  0.  0.  1.  0.  0.  1.  0.  1.  0.  0.  0.  0.  1.  0.  0.
  1.  0.  1.  0.  1.  0.  1.  0.  1.  1.  0.  1.  0.  0.  0.  1.  0.  0.
  0.  0.  0.  0.  1.  1.  1.  1.  0.  0.  0.  0.  1

In [12]:
# Convert the 1.0/0.0 float labels to integers so they are written as 1/0.
prediction = prediction.astype(int)
print(prediction)

[0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]


In [13]:
# Assemble the submission frame: a single `Survived` column holding the
# predicted labels, indexed by the PassengerId values of the test rows.
solution = pd.DataFrame(
    data=prediction,
    index=np.array(test['PassengerId']),
    columns=['Survived'],
)

In [14]:
# Preview the first rows of the submission frame.
solution.head()

Unnamed: 0,Survived
892,0
893,0
894,0
895,0
896,1


In [15]:
# Number of rows in the submission frame (one per row of `test`).
len(solution)

418

In [16]:
# Write the submission file; the index (passenger ids) is written as the first
# column under the header 'PassengerId'.
solution.to_csv(
    path_or_buf='linear_regression.csv',
    index_label='PassengerId',
)