-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathUntitled-1.py
93 lines (68 loc) · 3.23 KB
/
Untitled-1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re
path = './tdd.csv'
data = pd.read_csv(path)
df = pd.DataFrame(data)
df.drop([ 'Trip ID', 'Destination','Start date','Traveler nationality', 'End date', 'Traveler name', 'Transportation type', 'Transportation cost'], axis=1,inplace=True) #, 'Transportation cost'
#df.rename(['Traveler_gender','Traveler_age','Accommodation_type', 'Accommodation_cost'], inplace=True)
df.dropna(inplace=True)
#df.rename(columns={'Duration (days)': 'Duration'}, inplace=True)
df.columns=['Duration', 'Traveler_age', 'Traveler_gender', 'Accommodation_type', 'Accommodation_cost']
df['Traveler_ages'] = df['Traveler_age'].agg(lambda x: x - x%10).map(int)
df['Accommodation_cost'] = df['Accommodation_cost'].agg(lambda x: re.sub('[^0-9]', '', x)).map(int)
sns.set_style('whitegrid')
#성별과 나이에 따른 평균 여행기간(일)
grouped1 = df.groupby(['Traveler_gender', 'Traveler_ages'])
dumean = grouped1['Duration'].mean()
#성별 간 숙박업소에 대한 파이차트와 비용에 대한 막대그래프
grouped2 = df.groupby(['Traveler_gender', 'Accommodation_type'])
#minmax scale
acc_type = grouped2.mean().Accommodation_cost.to_frame()
def minmax(x):
return (x - x.min())/(x.max() - x.min())
f_scale = acc_type.loc["Female"].transform(minmax)
m_scale = acc_type.loc['Male'].transform(minmax)
#grouped3 = df.groupby('Traveler_gender').get_group('Male')
#g3_mean = grouped3.groupby(('Accommodation_type')).mean()
############### 그래프 2가지 이상
fig = plt.figure(figsize=(13,8))
#-> 나이에 따른 여행기간의 바이올린그래프
ax1 = fig.add_subplot(1, 2, 1)
sns.violinplot(x = 'Traveler_gender', y = 'Traveler_ages', hue='Traveler_gender', data=dumean.to_frame(), ax=ax1)
ax2 = fig.add_subplot(1, 2, 2)
sns.distplot(dumean.to_frame(), ax=ax2)
#파이차트
fig_pie = plt.figure(figsize=(15, 10))
ax3 = fig_pie.add_subplot(1, 2, 1)
f_scale.plot(kind='pie', figsize=(13,7), subplots=True, ax=ax3, title='Accommdation', legend=False)
ax4 = fig_pie.add_subplot(1, 2, 2)
plt.pie(data=m_scale, autopct='%1.1f%%',labels=acc_type.loc['Male'].index, x='Accommodation_cost')
#sns.lmplot(x='Accommodation type', y = 'Accommodation cost', data=m_scale, hue=g3_mean.index)
plt.axis('equal')
plt.legend()
#회귀분석
ndf = df[['Accommodation_cost', 'Traveler_age', 'Duration' ]]
#g3_mean.plot(kind='scatter', x='Traveler_ages', y='Accommodation_cost', s = 10, figsize=(10, 5))
ndf.plot(kind='scatter', x='Accommodation_cost', y='Traveler_age', s=10, figsize=(10, 5))
X = ndf[['Traveler_age']]
Y = ndf['Duration']
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=10)
#print(len(X_train), len(Y_train)) #95 95
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, Y_train)
y_hat = lr.predict(X)
#mse
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(X, y_hat)
print('mse')
print(mse)
plt.figure(figsize=(10, 5))
ax1 = sns.distplot(Y, hist=False,label='y')
ax2 = sns.distplot(y_hat, hist=False, label='y_hat', ax=ax1)
plt.show()
plt.close()