forked from SophMC/notechain
-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean_test_53.py
133 lines (106 loc) · 4.93 KB
/
clean_test_53.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# This script cleans up the test data for the Titanic competition using a
# method similar to the notebook
# https://www.kaggle.com/creepykoala/titanic/study-of-tree-and-forest-algorithms
# which I have already applied to the training data.
# Import libraries
import numpy as np
from numpy.random import random_integers
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from scipy.stats import pointbiserialr, spearmanr
# Load the Titanic test data; header=0 takes the column names from the first
# row of the file.
_TEST_CSV_PATH = '/home/sophie/projects/Titanic/data/test.csv'
df = pd.read_csv(_TEST_CSV_PATH, header=0)
# People with stronger titles tend to have more help on board, so every raw
# title that can appear in 'Name' is collapsed into a broader title category.
# Keys are the raw titles; values are the category used from here on.
Title_Dictionary = {
    # Military ranks
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    # Nobility
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    # Doctors and clergy are grouped with the officers
    "Dr": "Officer",
    "Rev": "Officer",
    # More nobility
    "the Countess": "Royalty",
    "Dona": "Royalty",
    # French forms and "Ms" are folded into the common English titles
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    # Common titles map to themselves
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty",
}
def _title_category(full_name):
    """Map a passenger's full name to its title category.

    The raw title is the text between the first comma and the following
    period in the name, with surrounding whitespace removed. A title that is
    not a key of Title_Dictionary raises KeyError.
    """
    after_surname = full_name.split(',')[1]
    raw_title = after_surname.split('.')[0].strip()
    return Title_Dictionary[raw_title]


# New 'Title' column holding the title category of every passenger.
df['Title'] = df['Name'].apply(_title_category)
# Extract the leading (non-numeric) token from each element of 'Ticket'
def Ticket_Prefix(s):
    """Return the prefix of a ticket string, or 'NoClue' if it has none.

    The prefix is the first whitespace-separated token of ``s``. When that
    token consists only of digits, or when ``s`` is empty or whitespace-only
    and therefore has no tokens at all, the placeholder 'NoClue' is returned
    instead.

    Args:
        s: The raw ticket string.

    Returns:
        The first token of ``s``, or the string 'NoClue'.
    """
    # split() without arguments splits on any run of whitespace and yields an
    # empty list for a blank string.
    tokens = s.split()
    if not tokens:
        # Indexing [0] on a blank ticket would raise IndexError; a blank
        # ticket simply has no usable prefix.
        return 'NoClue'
    first_token = tokens[0]
    return 'NoClue' if first_token.isdigit() else first_token
# New 'TicketPrefix' column: Ticket_Prefix applied to every ticket. The
# function is passed directly; wrapping it in a lambda adds nothing.
df['TicketPrefix'] = df['Ticket'].apply(Ticket_Prefix)

# Boolean mask that is True where 'Age' is present (null values are False).
mask_Age = df.Age.notnull()
# New dataframe where all rows have a value for age, restricted to the
# columns needed to compute the medians.
Age_Sex_Title_Pclass = df.loc[mask_Age, ["Age", "Title", "Sex", "Pclass"]]
# Median known age for every (Title, Pclass, Sex) group.
Filler_Ages_1 = Age_Sex_Title_Pclass.groupby(
    by=["Title", "Pclass", "Sex"]).median()
# Each unstack moves the innermost row-index level into the column headers:
# first Sex, then Pclass. Title is left as the row index.
Filler_Ages = Filler_Ages_1.Age.unstack(level=-1).unstack(level=-1)

# Re-point the mask: it is now True where 'Age' is missing.
mask_Age = df.Age.isnull()
# New DataFrame with the rows that have no age. The explicit .copy() makes it
# an independent frame, so that adding an 'Age' column to it later in the
# script does not trigger pandas' SettingWithCopyWarning.
Age_Sex_Title_Pclass_missing = df.loc[
    mask_Age, ["Title", "Sex", "Pclass"]].copy()
# Look-up function for the calculated median ages.
def Age_filler(row):
    """Return the median age recorded in Filler_Ages for one passenger row.

    The row's 'Sex' selects the sub-table of the module-level Filler_Ages
    lookup; the row's 'Title' and 'Pclass' then select the cell within it.

    Args:
        row: A row exposing 'Sex', 'Title' and 'Pclass'.

    Returns:
        The value stored in Filler_Ages for that combination, or None when
        'Sex' is neither "female" nor "male".
    """
    sex = row.Sex
    if sex == "female" or sex == "male":
        ages_for_sex = Filler_Ages[sex]
        return ages_for_sex.loc[row["Title"], row["Pclass"]]
    # Any other value falls through without a lookup.
    return None
# Look up a median age for every row of the "missing" dataframe and store the
# result there as a new 'Age' column.
imputed_ages = Age_Sex_Title_Pclass_missing.apply(Age_filler, axis=1)
Age_Sex_Title_Pclass_missing["Age"] = imputed_ages

# Re-form the 'Age' column of df from the known ages and the filled-in ones.
# Assigning the concatenated Series aligns it with df on the row index.
age_pieces = [
    Age_Sex_Title_Pclass["Age"],
    Age_Sex_Title_Pclass_missing["Age"],
]
df["Age"] = pd.concat(age_pieces)
# Fill missing fares with the mean of all fares present in this data.
mean_fare = df['Fare'].mean()
df['Fare'] = df['Fare'].fillna(value=mean_fare)

# FamilySize is the sum of the 'SibSp' and 'Parch' columns.
df['FamilySize'] = df['SibSp'] + df['Parch']

# Remove the raw 'Ticket' and 'Cabin' columns.
unwanted_columns = ['Ticket', 'Cabin']
df = df.drop(unwanted_columns, axis=1)
# get_dummies splits a categorical column into separate indicator columns,
# one per distinct value, each named '<prefix>_<value>'. Every categorical
# feature uses its own column name as the prefix.
(dummies_Sex,
 dummies_Embarked,
 dummies_Pclass,
 dummies_Titles,
 dummies_TicketPrefix) = (
    pd.get_dummies(df[column], prefix=column)
    for column in ('Sex', 'Embarked', 'Pclass', 'Title', 'TicketPrefix')
)

indicator_frames = [dummies_Sex, dummies_Embarked, dummies_Pclass,
                    dummies_Titles, dummies_TicketPrefix]
# The categorical columns that were just expanded, plus 'Name'.
replaced_columns = ['Sex', 'Embarked', 'Pclass', 'Title', 'Name',
                    'TicketPrefix']

# Append the indicator columns to the end of df, drop the columns they
# replace, and set PassengerId as the index.
df = (
    pd.concat([df] + indicator_frames, axis=1)
    .drop(replaced_columns, axis=1)
    .set_index(['PassengerId'])
)
# FEATURE SELECTION
# To select features we correlate each feature against Survived in the
# training data. We need to use different algorithms for the different data
# types:
# - Spearman-Rank correlation for nominal vs nominal data
# - Point-Biserial correlation for nominal vs continuous data
# NOTE(review): the hard-coded list below is presumably the outcome of that
# analysis on the training data - confirm against the training script.
best_feature_names = [
    'Title_Mr', 'Sex_male', 'Sex_female', 'Title_Mrs', 'Title_Miss',
    'Pclass_3', 'Pclass_1', 'Fare', 'Embarked_C', 'Embarked_S',
]
best_features = df[best_feature_names]

# Output this to csv to be read in for making a prediction. The file is
# written with a single space as the field separator.
output_csv_path = '/home/sophie/projects/Titanic/data/clean_test_53.csv'
best_features.to_csv(output_csv_path, sep=" ")