In [27]:
#IMPORTING ALL THE PACKAGES
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
import statsmodels.formula.api as smf

%matplotlib inline
from sklearn.metrics import accuracy_score

## 1. Loading the Data

In [28]:
# Load the Titanic passenger data. pandas infers bz2 decompression from the
# '.bz2' file extension, so no explicit `compression=` argument is needed.
data_titanic = pd.read_csv('titanic.csv.bz2')

# Observed outcome column, kept separately so predictions can be scored
# against it later.
target_data = data_titanic['survived']

# Preview the first rows. A bare expression is only rendered when it is the
# last statement of the cell, so the preview has to come last.
data_titanic.head()



## 2. LPM Model

# Linear probability model (LPM): ordinary least squares with `survived` as
# the outcome, passenger class entered as a categorical, plus sex.
mod_titanic = smf.ols(
    formula='survived ~ C(pclass) + sex',
    data=data_titanic,
)
result = mod_titanic.fit()

# Full regression table (coefficients, standard errors, fit statistics).
print(result.summary())

In [29]:
# Fitted survival probabilities from the LPM for every passenger.
# The frame must be `data_titanic` (the one the model was fitted on); the
# name `data` is not defined anywhere in this notebook and raises NameError
# on a fresh kernel.
predictions = result.predict(data_titanic)
predictions

0       0.898984
1       0.394113
2       0.898984
3       0.394113
4       0.898984
5       0.394113
6       0.898984
7       0.394113
8       0.898984
9       0.394113
10      0.394113
11      0.898984
12      0.898984
13      0.898984
14      0.394113
15      0.394113
16      0.394113
17      0.898984
18      0.898984
19      0.394113
20      0.394113
21      0.898984
22      0.394113
23      0.898984
24      0.898984
25      0.394113
26      0.394113
27      0.898984
28      0.898984
29      0.394113
          ...   
1279    0.606349
1280    0.101478
1281    0.101478
1282    0.101478
1283    0.101478
1284    0.101478
1285    0.101478
1286    0.606349
1287    0.101478
1288    0.101478
1289    0.101478
1290    0.606349
1291    0.101478
1292    0.101478
1293    0.101478
1294    0.101478
1295    0.101478
1296    0.101478
1297    0.101478
1298    0.101478
1299    0.101478
1300    0.606349
1301    0.101478
1302    0.101478
1303    0.101478
1304    0.606349
1305    0.606349
1306    0.1014

## 3. Max & Min of survival predictions

In [30]:
# Largest fitted value first, then the smallest, one per line.
print(predictions.max(), predictions.min(), sep='\n')

0.8989839322447716
0.10147811470358636


## 4. Conditional Prediction

In [31]:
# Convert fitted probabilities into class labels: 1 when the fitted value is
# at least 0.5, otherwise 0. The comparison is vectorised, so it does not
# depend on the Series having a default 0..n-1 index (the element-wise loop
# with `predictions[i]` did) and it avoids a Python-level loop.
# `.astype(float)` keeps the float dtype the Series had before (1.0 / 0.0).
predictions = (predictions >= 0.5).astype(float)

## 5. Compare actual and predicted values

In [32]:
# Share of passengers whose thresholded prediction matches the observed outcome.
print(accuracy_score(y_true=target_data, y_pred=predictions))

0.7799847211611918


## 6. Estimate similar model 

In [35]:
# Same outcome and regressors as the first model, but `*` in the formula adds
# the class-by-sex interaction terms on top of the two main effects.
passenger_class_effect_model = smf.ols(
    formula='survived ~ C(pclass)*sex',
    data=data_titanic,
)
result_class_effect_model = passenger_class_effect_model.fit()

# Full regression table for the interaction model.
print(result_class_effect_model.summary())

                            OLS Regression Results                            
Dep. Variable:               survived   R-squared:                       0.370
Model:                            OLS   Adj. R-squared:                  0.368
Method:                 Least Squares   F-statistic:                     153.2
Date:                Wed, 24 Apr 2019   Prob (F-statistic):          4.02e-128
Time:                        15:20:44   Log-Likelihood:                -609.86
No. Observations:                1309   AIC:                             1232.
Df Residuals:                    1303   BIC:                             1263.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
Intercept           

## 7. Interpretation

The highest survival rate is for 1st-class females. In contrast, 3rd-class females have a considerably lower survival rate than 1st-class females.

## 8. Accuracy 

In [39]:
# Fitted survival probabilities from the interaction model, converted to
# class labels: 1 when the fitted value is at least 0.5, otherwise 0.
# The comparison is vectorised, so it does not rely on a default 0..n-1 index
# (the element-wise loop with `predictions_2[i]` did) and needs no Python loop.
# `.astype(float)` keeps the labels as floats (1.0 / 0.0), as before.
predictions_2 = (
    result_class_effect_model.predict(data_titanic) >= 0.5
).astype(float)

In [40]:
# Accuracy of the interaction model. Arguments are passed in sklearn's
# documented (y_true, y_pred) order; accuracy itself is symmetric, so the
# value is unchanged, but the explicit order stays correct if this call is
# ever swapped for an asymmetric metric (precision, recall, ...).
accuracy_score(y_true=target_data, y_pred=predictions_2)

0.7830404889228418

The accuracy has increased slightly, by about 0.3 percentage points (from 78.0% to 78.3%), and the R-squared has also increased.