In [2]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [50]:
# First look at the data: load both CSV files and stack them into one frame.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
# Training rows come first, followed by the test rows, under a fresh 0..n-1 index.
all_data = pd.concat([train_df, test_df], ignore_index=True)
# Other quick inspections that can be swapped in as the cell's last expression:
#   all_data.head(5)
#   all_data.tail(5)
#   all_data.info()
#   all_data['Survived'].describe()
#   all_data.describe()
all_data.columns


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [51]:
# Look directly at how one feature relates to the target.
var_x = 'Pclass'
var_y = 'Survived'
# Mean of var_y for each value of var_x, highest mean first.
# A notebook only renders the final expression of a cell automatically, so this
# mid-cell table is handed to display() explicitly (display is provided by the
# IPython/Jupyter kernel); otherwise the result would be computed and thrown away.
relation_table = (
    train_df[[var_x, var_y]]
    .groupby([var_x], as_index=False)
    .mean()
    .sort_values(by=var_y, ascending=False)
)
display(relation_table)

# Missing values per column: absolute count and fraction of rows.
# The boolean null mask is built once and reused for both statistics.
null_mask = all_data.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
Cabin,1014,0.774637
Survived,418,0.319328
Age,263,0.200917
Embarked,2,0.001528
Fare,1,0.000764
PassengerId,0,0.0
Pclass,0,0.0
Name,0,0.0
Sex,0,0.0
SibSp,0,0.0


In [45]:
# Plot-based inspection: choose the feature (x) and the target (y) to look at.
var_x, var_y = 'Pclass', 'Survived'

# Disabled template: mean of var_y per var_x group, drawn as a bar chart.
# groups = train_df.groupby(var_x)[var_y].mean()
# plt.figure(figsize=(10, 6))
# plt.bar(groups.index, groups.values)
# plt.xlabel(var_x)
# plt.ylabel(var_y)
# plt.show()

# Disabled template: scatter plot of var_y against var_x.
# NOTE(review): 'GrLivArea' below is not one of this dataset's columns; the
# line appears to be a leftover and is unused by the rest of the template.
# var = 'GrLivArea'
# data = pd.concat([train_df[var_y], train_df[var_x]], axis=1)
# data.plot.scatter(x=var_x, y=var_y, ylim=(0,train_df[var_y].max()));

In [52]:
# Data preparation: drop unused columns, fill gaps, encode categories,
# band Age, and derive the family-size features on the combined frame.
# NOTE(review): this cell rewrites `all_data` under the same name, so it is
# meant to run once after the loading cell; re-running it without reloading
# will fail on the already-dropped columns.

# --- Drop features that are not used further on ---
drop_feature = ['Ticket', 'Cabin', 'Name', 'PassengerId']
print(all_data.shape)
all_data = all_data.drop(drop_feature, axis=1)
print(all_data.shape)

# Template for dropping individual rows (column names are placeholders that
# do not exist in this dataset):
# all_data = all_data.drop(all_data[(all_data['GrLivArea']>4000) & (all_data['SalePrice']<300000)].index)

# --- Fill missing values ---
# NOTE(review): the fill statistics are computed over `all_data`, i.e. whatever
# rows that frame holds at this point — confirm this is intended if it contains
# both training and test rows.
# Embarked: most frequent value.
all_data['Embarked'] = all_data['Embarked'].fillna(all_data['Embarked'].dropna().mode()[0])
# Template for a group-wise median fill (placeholder column names):
# all_data["Age"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
# Age and Fare: column mean (mean() already ignores missing entries).
all_data['Age'] = all_data['Age'].fillna(all_data['Age'].mean())
all_data['Fare'] = all_data['Fare'].fillna(all_data['Fare'].mean())

# --- Encode categorical columns as integers ---
all_data['Sex'] = all_data['Sex'].map({'female': 1, 'male': 0}).astype(int)
all_data['Embarked'] = all_data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

# --- Turn Age into bands ---
# Exploratory view: five equal-width age bands on the training frame and the
# mean of Survived in each. The table is passed to display() because a
# notebook only renders a cell's last expression on its own (display is
# provided by the IPython/Jupyter kernel).
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)
age_band_table = (
    train_df[['AgeBand', 'Survived']]
    .groupby(['AgeBand'], as_index=False)
    .mean()
    .sort_values(by='AgeBand', ascending=True)
)
display(age_band_table)

# Recode Age to band codes 0..4 in a single step:
#   <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, >64 -> 4.
# One pd.cut call replaces a chain of in-place overwrites whose correctness
# depended on already-written codes never matching a later condition.
# The result is cast to float so the column keeps a floating-point dtype.
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
all_data['Age'] = pd.cut(all_data['Age'], bins=age_edges, labels=False).astype(float)

# --- Derived features ---
# FamilySize counts the passenger plus siblings/spouses and parents/children;
# IsAlone is 1 exactly when FamilySize is 1. IsAlone is assigned first so the
# resulting column order is (..., IsAlone, FamilySize).
family_size = all_data['SibSp'] + all_data['Parch'] + 1
all_data['IsAlone'] = (family_size == 1).astype('int64')
all_data['FamilySize'] = family_size

# The raw counts are dropped once the derived features exist.
drop_feature = ['Parch', 'SibSp']
all_data = all_data.drop(drop_feature, axis=1)

all_data.head(5)

(1309, 12)
(1309, 8)


Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,IsAlone,FamilySize
0,0.0,3,0,1.0,7.25,0,0,2
1,1.0,1,1,2.0,71.2833,1,0,2
2,1.0,3,1,1.0,7.925,0,1,1
3,1.0,1,1,2.0,53.1,0,0,2
4,0.0,3,0,2.0,8.05,0,1,1


In [63]:
# Split the combined frame back into its two parts: the first len(train_df)
# rows are the training part, everything after that is the test part.
n_train_rows = len(train_df)
train = all_data.iloc[:n_train_rows]
test = all_data.iloc[n_train_rows:]

# Features are every column except the target; the target itself is read
# from the original training frame.
X_train = train.drop(labels="Survived", axis=1)
Y_train = train_df["Survived"]
X_test = test.drop(labels="Survived", axis=1).copy()