In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import pandas_profiling as ppf  #eda
from sklearn.preprocessing import LabelEncoder #标签编码
from sklearn.preprocessing import MinMaxScaler #归一化
from sklearn.model_selection import train_test_split #数据集的划分
from sklearn.linear_model import LinearRegression #算法
from sklearn.metrics import mean_absolute_error #评估函数

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
# Read the labelled training split and the unlabelled test split from the
# local input/ directory.
train, test = map(pd.read_csv, ('input/train.csv', 'input/test.csv'))

In [4]:
# Summarise the training frame: column dtypes and non-null counts.
train.info()
# Column 5 (Age) has missing values; Cabin and Embarked are incomplete as well.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
# Same summary for the test frame, to see which of its columns need imputation.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [6]:
#ppf.ProfileReport(train)

## 数据处理

### 1. 缺失值处理：均值填充

In [7]:
# Impute missing numeric values with the mean of the TRAINING column.
# The test frame reuses the training means so that both frames are
# preprocessed with the same statistics and nothing is learned from the
# held-out data.
age_mean = train['Age'].mean()    # Series.mean() skips NaN by default
fare_mean = train['Fare'].mean()

train['Age'] = train['Age'].fillna(age_mean)
test['Age'] = test['Age'].fillna(age_mean)

# Fare is only filled for the test frame, as in the original cell.
test['Fare'] = test['Fare'].fillna(fare_mean)

In [8]:
# Count the missing ages remaining in the training frame after imputation.
train['Age'].isna().sum()

0

### 2.将字符串做成onehot数值型

In [9]:
# onehot 编码
train_dummy = pd.get_dummies(train[['Sex', 'Embarked']])
test_dummy = pd.get_dummies(test[['Sex', 'Embarked']])
train_dummy[:2]

Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,1,0,0,1
1,1,0,1,0,0


In [10]:
# Swap the raw categorical columns for their one-hot encoded counterparts
# (joined on the row index) in both frames.
train = train.drop(columns=['Embarked', 'Sex']).join(train_dummy)
test = test.drop(columns=['Embarked', 'Sex']).join(test_dummy)

### 3.删除没必要的列

In [11]:
# Keep the passenger ids for the submission file, then discard the
# identifier and free-text columns from both frames.
test_PassengerId = test['PassengerId']
train = train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
test = test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

In [12]:
# Preview the first rows of the training frame after encoding and column removal.
train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,1,3,26.0,0,0,7.925,1,0,0,0,1
3,1,1,35.0,1,0,53.1,1,0,0,0,1
4,0,3,35.0,0,0,8.05,0,1,0,0,1


### 4.对数据标准化处理（StandardScaler）

In [13]:
# Standardise the continuous features (zero mean, unit variance per column).
# The scaler is fitted on the TRAINING data only and then reused to transform
# the test data. Re-fitting on the test frame would scale it with a different
# mean/std than the features the model is trained on.
# Note: scalers expect 2-D input, hence the column-list selection; the
# earlier attempt that passed a bare Series (train['Age']) raised an error.
scaled_cols = ['Age', 'Fare']
stdsc = StandardScaler()
train[scaled_cols] = stdsc.fit_transform(train[scaled_cols])
test[scaled_cols] = stdsc.transform(test[scaled_cols])

In [14]:
# stdsc = StandardScaler()
# train_conti_std = stdsc.fit_transform(train[['Age', 'SibSp', 'Parch', 'Fare']])
# test_conti_std = stdsc.fit_transform(test[['Age', 'SibSp', 'Parch', 'Fare']])
# # turn ndarray into dataframe
# train_conti_std = pd.DataFrame(data=train_conti_std, columns=['Age', 'SibSp', 'Parch', 'Fare'],\
#                                index=train.index) #note train.index
# test_conti_std = pd.DataFrame(data=test_conti_std, columns=['Age', 'SibSp', 'Parch', 'Fare'],\
#                               index=test.index) #note train.index

## 数据建模
### 1.划分数据集

In [15]:
# Separate the target from the features, then hold out 20% of the rows for
# validation with a fixed seed so the split is reproducible.
y = train['Survived']
x = train.drop(columns=['Survived'])
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

In [16]:
# Sanity check: the feature matrix and the label vector must have the same number of rows.
x_train.shape, y_train.shape

((712, 10), (712,))

### 2.搭建模型，训练

In [17]:
# linear = LinearRegression()
# linear.fit(x_train, y_train)

# y_pred = linear.predict(x_test)
# mae = mean_absolute_error(y_pred, y_test)
# mae

In [18]:
# Fit a logistic-regression classifier on the training split (fit() returns
# the estimator itself), then evaluate it on the held-out validation split.
classifier = LogisticRegression(random_state=0).fit(x_train, y_train)

# Rows of the confusion matrix are the true classes, columns the predictions.
y_pred = classifier.predict(x_test)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf_matrix)

[[93 17]
 [18 51]]




### 3.提交

In [20]:
# Predict survival for the competition test frame. Columns are selected in
# the exact order used for training, so a missing feature raises a KeyError
# here rather than a reordered layout being fed to the model positionally.
predict = classifier.predict(test[x_train.columns])

In [21]:
# Assemble the two-column submission table: passenger id and predicted label.
submission = pd.DataFrame(
    data={'PassengerId': test_PassengerId, 'Survived': predict},
)

In [22]:
# Write the predictions to submission.csv; index=False omits the row index column.
submission.to_csv('submission.csv', index=False)

In [23]:
# Read the file back to verify what was actually written to disk.
pd.read_csv('submission.csv')

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
