In [1]:
import numpy as np 
import pandas as pd  
pd.set_option('display.max_column', 100)
import matplotlib.pyplot as plt 
import seaborn as sns 

# Load the training data; the trailing expression echoes (rows, columns).
df = pd.read_csv('titanic_train.csv')
df.shape

(891, 12)

In [2]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [3]:
# Preview the first three rows of the raw frame.
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [7]:
# Remove fully duplicated rows.
df = df.drop_duplicates()

# Binary-encode Sex (male -> 1, anything else -> 0).
# Only encode while the column still holds the raw labels: comparing an
# already-encoded integer column with 'male' is False everywhere, so
# re-running this cell would otherwise set every row to 0.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = (df['Sex'] == 'male').astype(int)

In [13]:
# Indicator column: 1 where Age is missing, 0 where it is present.
df['Age_missing'] = df['Age'].isna().astype(int)

In [25]:
from sklearn import model_selection 
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier 

# Columns used as model inputs for this baseline.
predictors = ['Pclass', 'Sex', 'Age_missing', 'SibSp', 'Parch']

# Fit on the full training frame so `lgr` is a trained model after this cell.
# cross_val_score works on clones of the estimator, so this fit does not
# influence the scores printed below.
lgr = LogisticRegression(random_state=1)
lgr.fit(df[predictors], df['Survived'])

# cv=3 is pinned explicitly: the recorded output has three fold scores (the
# implicit default at the time), whereas newer scikit-learn releases default
# to 5 folds when cv is omitted.
scores = model_selection.cross_val_score(lgr, df[predictors], df['Survived'], cv=3)
print('Logistic:', scores)

alg = RandomForestClassifier(
    random_state=1,
    n_estimators=100,
    min_samples_split=4,
    min_samples_leaf=4,
)
scores = model_selection.cross_val_score(alg, df[predictors], df['Survived'], cv=3)
print('random forest:', scores)

Logistic: [0.79124579 0.81818182 0.78114478]
random forest: [0.76094276 0.81818182 0.80808081]


#### ['Pclass', 'Sex']
Logistic: [0.78787879 0.8013468  0.77104377]
random forest: [0.75757576 0.8013468  0.77104377]

['Pclass', 'Sex', 'Age_missing']
Logistic: [0.76767677 0.8013468  0.77104377]
random forest: [0.77777778 0.81818182 0.79124579]



In [86]:
# Keep only the first capital letter matched by the regex in Cabin; rows
# without a match (including missing cabins) get the 'Missing' category.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the selected column, because that chained in-place form does not update
# `df` under pandas copy-on-write.
df['Cabin'] = df['Cabin'].str.extract('([A-Z])', expand=False).fillna('Missing')

# Flag missing ages first, then replace them with the -1 sentinel.
df['Age_missing'] = df['Age'].isnull().astype(int)
df['Age'] = df['Age'].fillna(-1)

# Rebuild Age_group from scratch. errors='ignore' lets the cell run on a
# fresh kernel where the column does not exist yet. Binning happens after the
# fill so the -1 sentinel falls into the (-2, 0] bin instead of staying NaN
# and leaving that bin empty.
df = df.drop(columns='Age_group', errors='ignore')
df['Age_group'] = pd.cut(df['Age'], [-2, 0, 12, 50, 80])

# Remove the Embarked, PassengerId, Ticket and Name columns in one call.
df = df.drop(columns=['Embarked', 'PassengerId', 'Ticket', 'Name'])

# Round fares to whole numbers, then bucket them into three ranges.
df['Fare'] = df['Fare'].apply(np.round)
df['Fare_group'] = pd.cut(df['Fare'], [-1, 12, 32, 600])

# One-hot encode the categorical columns (the source columns are replaced by
# their indicator columns).
df = pd.get_dummies(df, columns=['Sex', 'Cabin', 'Fare_group', 'Age_group'])

In [70]:
# Impute missing ages with the mean of the observed ages.
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Keep only the first capital letter matched by the regex in Cabin; rows
# without a match get the 'missing' category.
# expand=False makes extract() return a Series rather than a one-column
# DataFrame, and the fill is assigned back rather than done through a chained
# fillna(..., inplace=True), which does not update `df` under pandas
# copy-on-write.
df['Cabin'] = df['Cabin'].str.extract('([A-Z])', expand=False).fillna('missing')

# Remove the PassengerId, Ticket and Embarked columns.
df = df.drop(columns=['PassengerId', 'Ticket', 'Embarked'])

In [71]:
# Binary-encode Sex (female -> 1, anything else -> 0).
# Only encode while the column still holds the raw labels: on a column that
# is already numeric the comparison with 'female' is False for every row, so
# running this cell a second time (or after another encoding of Sex) would
# silently turn the whole column into 0.
# NOTE(review): if Sex was already encoded elsewhere with the opposite
# polarity, that encoding is left untouched here — confirm which one is wanted.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = (df['Sex'] == 'female').astype(int)

In [72]:
# One-hot encode Cabin; the column is replaced by Cabin_<value> indicator columns.
df = pd.get_dummies(df, columns=['Cabin'])

In [73]:
# Bucket Age into (-1, 12], (12, 50] and (50, 80], then one-hot encode the
# buckets in the same step.
df = pd.get_dummies(
    df.assign(Age_group=pd.cut(df['Age'], bins=[-1, 12, 50, 80])),
    columns=['Age_group'],
)

In [92]:
# Mean fare per passenger class. This is evaluated but not shown: a cell only
# echoes its last expression.
df['Fare'].groupby(df['Pclass']).mean()

# Bucket Fare into (-1, 12], (12, 32] and (32, 600], then list the buckets
# that actually occur.
df['Fare_group'] = pd.cut(df['Fare'], bins=[-1, 12, 32, 600])
df['Fare_group'].unique()

[(-1, 12], (32, 600], (12, 32]]
Categories (3, interval[int64]): [(-1, 12] < (12, 32] < (32, 600]]

In [93]:
# One-hot encode Fare_group; the column is replaced by one indicator column per bucket.
df = pd.get_dummies(df, columns=['Fare_group'])

In [94]:
# Re-check dtypes and non-null counts after the feature engineering steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 26 columns):
Survived                891 non-null int64
Pclass                  891 non-null int64
Name                    891 non-null object
Sex                     891 non-null int32
Age                     891 non-null float64
SibSp                   891 non-null int64
Parch                   891 non-null int64
Fare                    891 non-null float64
Cabin_A                 891 non-null uint8
Cabin_B                 891 non-null uint8
Cabin_C                 891 non-null uint8
Cabin_D                 891 non-null uint8
Cabin_E                 891 non-null uint8
Cabin_F                 891 non-null uint8
Cabin_G                 891 non-null uint8
Cabin_T                 891 non-null uint8
Cabin_missing           891 non-null uint8
Age_group_(-1, 12]      891 non-null uint8
Age_group_(12, 50]      891 non-null uint8
Age_group_(50, 80]      891 non-null uint8
Fare_group_(-1, 12]     891 no

In [75]:
# Preview the first three rows of the transformed frame.
df.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_missing,"Age_group_(-1, 12]","Age_group_(12, 50]","Age_group_(50, 80]"
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,0,0,0,0,0,0,0,0,1,0,1,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,0,0,1,0,0,0,0,0,0,0,1,0
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,0,0,0,0,0,0,0,0,1,0,1,0


In [97]:
from sklearn import model_selection 
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier 

# Model inputs: the raw numeric columns plus the one-hot Cabin, Age_group and
# Fare_group indicator columns.
predictors = [
    'Pclass', 'Sex', 'SibSp', 'Parch',
    'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E',
    'Cabin_F', 'Cabin_G', 'Cabin_T', 'Cabin_missing',
    'Age_group_(-1, 12]', 'Age_group_(12, 50]', 'Age_group_(50, 80]',
    'Fare_group_(-1, 12]', 'Fare_group_(12, 32]', 'Fare_group_(32, 600]',
]

# Fit on the full training frame so `lgr` is a trained model after this cell.
# cross_val_score works on clones of the estimator, so this fit does not
# influence the scores printed below.
lgr = LogisticRegression(random_state=1)
lgr.fit(df[predictors], df['Survived'])

# cv=3 is pinned explicitly: the recorded output has three fold scores (the
# implicit default at the time), whereas newer scikit-learn releases default
# to 5 folds when cv is omitted.
scores = model_selection.cross_val_score(lgr, df[predictors], df['Survived'], cv=3)
print('Logistic:', scores)

alg = RandomForestClassifier(
    random_state=1,
    n_estimators=10,
    min_samples_split=2,
    min_samples_leaf=2,
)
scores = model_selection.cross_val_score(alg, df[predictors], df['Survived'], cv=3)
print('random forest:', scores)

Logistic: [0.77777778 0.79124579 0.7979798 ]
random forest: [0.76430976 0.82828283 0.82491582]


In [None]:
# read_csv() requires a path; called with no arguments it raises TypeError
# before anything is loaded.
# NOTE(review): 'titanic_test.csv' is assumed by analogy with
# 'titanic_train.csv' — confirm the actual file name and location.
df_test = pd.read_csv('titanic_test.csv')