In [1]:
import pandas as pd
import numpy as np

import os

def load_hotel_reserve():
  customer_tb = pd.read_csv('../awesomebook-master/data/customer.csv')
  hotel_tb = pd.read_csv('../awesomebook-master/data/hotel.csv')
  reserve_tb = pd.read_csv('../awesomebook-master/data/reserve.csv')
  return customer_tb, hotel_tb, reserve_tb


def load_holiday_mst():
  holiday_tb = pd.read_csv('../awesomebook-master/data/holiday_mst.csv',
                           index_col=False)
  return holiday_tb


def load_production():
  production_tb = pd.read_csv('../awesomebook-master/data/production.csv')
  return production_tb


def load_production_missing_num():
  production_tb = pd.read_csv('../awesomebook-master/data/production_missing_num.csv')
  return production_tb


def load_production_missing_category():
  production_tb = pd.read_csv('../awesomebook-master/data/production_missing_category.csv')
  return production_tb


def load_monthly_index():
  monthly_index_tb = \
    pd.read_csv('../awesomebook-master/data/monthly_index.csv')
  return monthly_index_tb


def load_meros_txt():
  with open('../awesomebook-master/data/txt/meros.txt', 'r') as f:
    meros = f.read()
    f.close()
  return meros


In [2]:
customer_tb, hotel_tb, reserve_tb = load_hotel_reserve()

reserve_tb

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100
...,...,...,...,...,...,...,...,...,...
4025,r4026,h_129,c_999,2017-06-27 23:00:02,2017-07-10,09:30:00,2017-07-11,2,16000
4026,r4027,h_97,c_999,2017-09-29 05:24:57,2017-10-09,10:30:00,2017-10-10,2,41800
4027,r4028,h_27,c_999,2018-03-14 05:01:45,2018-04-02,11:30:00,2018-04-04,2,74800
4028,r4029,h_48,c_1000,2016-04-16 15:20:17,2016-05-10,09:30:00,2016-05-13,4,540000


In [3]:
reserve_tb['total_price_log'] = reserve_tb['total_price'].apply(lambda x: np.log10(x / 1000 + 1))

reserve_tb['total_price_log']

0       1.992111
1       1.334454
2       1.539076
3       2.290925
4       1.839478
          ...   
4025    1.230449
4026    1.631444
4027    1.879669
4028    2.733197
4029    1.654177
Name: total_price_log, Length: 4030, dtype: float64

In [4]:
customer_tb['age_rank'] = (np.floor(customer_tb['age'] / 10) * 10).astype('category')

customer_tb

Unnamed: 0,customer_id,age,sex,home_latitude,home_longitude,age_rank
0,c_1,41,man,35.092193,136.512347,40.0
1,c_2,38,man,35.325076,139.410551,30.0
2,c_3,49,woman,35.120543,136.511179,40.0
3,c_4,43,man,43.034868,141.240314,40.0
4,c_5,31,man,35.102661,136.523797,30.0
...,...,...,...,...,...,...
995,c_996,44,man,34.465648,135.373787,40.0
996,c_997,35,man,35.345372,139.413754,30.0
997,c_998,32,woman,43.062267,141.272126,30.0
998,c_999,48,woman,38.172800,140.464198,40.0


In [5]:
from sklearn.preprocessing import StandardScaler

reserve_tb['people_num'] = reserve_tb['people_num'].astype(float)

standard_scaler = StandardScaler()

result = standard_scaler.fit_transform(reserve_tb[['people_num', 'total_price']])

reserve_tb['people_num_normalized'] = [x[0] for x in result]
reserve_tb['total_price_normalized'] = [x[1] for x in result]

reserve_tb

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,total_price_log,people_num_normalized,total_price_normalized
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4.0,97200,1.992111,1.300709,-0.053194
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2.0,20600,1.334454,-0.483753,-0.747822
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2.0,33600,1.539076,-0.483753,-0.629935
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4.0,194400,2.290925,1.300709,0.828240
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3.0,68100,1.839478,0.408478,-0.317080
...,...,...,...,...,...,...,...,...,...,...,...,...
4025,r4026,h_129,c_999,2017-06-27 23:00:02,2017-07-10,09:30:00,2017-07-11,2.0,16000,1.230449,-0.483753,-0.789536
4026,r4027,h_97,c_999,2017-09-29 05:24:57,2017-10-09,10:30:00,2017-10-10,2.0,41800,1.631444,-0.483753,-0.555575
4027,r4028,h_27,c_999,2018-03-14 05:01:45,2018-04-02,11:30:00,2018-04-04,2.0,74800,1.879669,-0.483753,-0.256323
4028,r4029,h_48,c_1000,2016-04-16 15:20:17,2016-05-10,09:30:00,2016-05-13,4.0,540000,2.733197,1.300709,3.962229


In [6]:
reserve_tb = reserve_tb[
    (abs(reserve_tb['total_price'] - np.mean(reserve_tb['total_price'])) / np.std(reserve_tb['total_price']) <= 3)
].reset_index()

reserve_tb

Unnamed: 0,index,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,total_price_log,people_num_normalized,total_price_normalized
0,0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4.0,97200,1.992111,1.300709,-0.053194
1,1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2.0,20600,1.334454,-0.483753,-0.747822
2,2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2.0,33600,1.539076,-0.483753,-0.629935
3,3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4.0,194400,2.290925,1.300709,0.828240
4,4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3.0,68100,1.839478,0.408478,-0.317080
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3927,4024,r4025,h_160,c_999,2017-03-11 11:56:05,2017-03-27,10:00:00,2017-03-30,1.0,37200,1.582063,-1.375984,-0.597289
3928,4025,r4026,h_129,c_999,2017-06-27 23:00:02,2017-07-10,09:30:00,2017-07-11,2.0,16000,1.230449,-0.483753,-0.789536
3929,4026,r4027,h_97,c_999,2017-09-29 05:24:57,2017-10-09,10:30:00,2017-10-10,2.0,41800,1.631444,-0.483753,-0.555575
3930,4027,r4028,h_27,c_999,2018-03-14 05:01:45,2018-04-02,11:30:00,2018-04-04,2.0,74800,1.879669,-0.483753,-0.256323


In [7]:
production_tb = load_production()

from sklearn.decomposition import PCA

pca = PCA(n_components=2)

pca_values = pca.fit_transform(production_tb[['length', 'thickness']])

print(f'누적 기여율: {sum(pca.explained_variance_ratio_)}')
print(f'각 차원의 기여율: {pca.explained_variance_ratio_}')

pca_new_values = pca.transform(production_tb[['length', 'thickness']])

pca_new_values

누적 기여율: 1.0
각 차원의 기여율: [0.97897794 0.02102206]


array([[  76.96838157,   13.38906936],
       [-112.11469337,    8.24884796],
       [ -76.1994339 ,  -11.19027127],
       ...,
       [  31.12100559,  -15.48152593],
       [-117.87675543,   -2.4361334 ],
       [   4.80243541,   15.32174872]], shape=(1000, 2))

In [8]:
production_miss_num = load_production_missing_num()

production_miss_num.replace('None', np.nan, inplace=True)

print(production_miss_num)

production_miss_num.dropna(subset=['thickness'], inplace=True)

print(production_miss_num)

    type      length  thickness  fault_flg
0      E  274.027383  40.241131      False
1      D   86.319269  16.906715      False
2      E  123.940388   1.018462      False
3      B  175.554886  16.414924      False
4      B  244.934740  29.061081      False
..   ...         ...        ...        ...
995    C  363.214163  48.369483      False
996    D  134.773797  26.861665      False
997    B  231.174985   7.087471      False
998    D   81.613510   5.716271      False
999    C  202.010973  35.211647       True

[1000 rows x 4 columns]
    type      length  thickness  fault_flg
0      E  274.027383  40.241131      False
1      D   86.319269  16.906715      False
2      E  123.940388   1.018462      False
3      B  175.554886  16.414924      False
4      B  244.934740  29.061081      False
..   ...         ...        ...        ...
995    C  363.214163  48.369483      False
996    D  134.773797  26.861665      False
997    B  231.174985   7.087471      False
998    D   81.613510   5.7162

In [9]:
production_miss_num.replace('None', np.nan, inplace=True)

production_miss_num['thickness'].fillna(1, inplace=True)

production_miss_num

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  production_miss_num['thickness'].fillna(1, inplace=True)


Unnamed: 0,type,length,thickness,fault_flg
0,E,274.027383,40.241131,False
1,D,86.319269,16.906715,False
2,E,123.940388,1.018462,False
3,B,175.554886,16.414924,False
4,B,244.934740,29.061081,False
...,...,...,...,...
995,C,363.214163,48.369483,False
996,D,134.773797,26.861665,False
997,B,231.174985,7.087471,False
998,D,81.613510,5.716271,False


In [10]:
production_miss_num.replace('None', np.nan, inplace=True)

production_miss_num['thickness'] = production_miss_num['thickness'].astype('float64')

thickness_mean = production_miss_num['thickness'].mean()

production_miss_num['thickness'].fillna(thickness_mean, inplace=True)

production_miss_num


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  production_miss_num['thickness'].fillna(thickness_mean, inplace=True)


Unnamed: 0,type,length,thickness,fault_flg
0,E,274.027383,40.241131,False
1,D,86.319269,16.906715,False
2,E,123.940388,1.018462,False
3,B,175.554886,16.414924,False
4,B,244.934740,29.061081,False
...,...,...,...,...
995,C,363.214163,48.369483,False
996,D,134.773797,26.861665,False
997,B,231.174985,7.087471,False
998,D,81.613510,5.716271,False
