In [1]:
import pandas as pd
import numpy as np

import os

def load_hotel_reserve():
  """Read the customer, hotel and reservation CSV files.

  Returns:
    A 3-tuple ``(customer_tb, hotel_tb, reserve_tb)`` holding the result of
    ``pd.read_csv`` for each file, in that order.
  """
  csv_paths = (
    '../awesomebook-master/data/customer.csv',
    '../awesomebook-master/data/hotel.csv',
    '../awesomebook-master/data/reserve.csv',
  )
  # Files are read one after another in the order listed above.
  return tuple(pd.read_csv(csv_path) for csv_path in csv_paths)


def load_holiday_mst():
  """Read the holiday master CSV file.

  Returns:
    The result of ``pd.read_csv`` for ``holiday_mst.csv``.
  """
  # index_col=False tells pandas not to use the first column as the index.
  read_options = {'index_col': False}
  return pd.read_csv('../awesomebook-master/data/holiday_mst.csv',
                     **read_options)


def load_production():
  """Read the production CSV file.

  Returns:
    The result of ``pd.read_csv`` for ``production.csv``.
  """
  return pd.read_csv('../awesomebook-master/data/production.csv')


def load_production_missing_num():
  """Read the ``production_missing_num.csv`` file.

  Returns:
    The result of ``pd.read_csv`` for that file.
  """
  csv_path = '../awesomebook-master/data/production_missing_num.csv'
  return pd.read_csv(csv_path)


def load_production_missing_category():
  """Read the ``production_missing_category.csv`` file.

  Returns:
    The result of ``pd.read_csv`` for that file.
  """
  csv_path = '../awesomebook-master/data/production_missing_category.csv'
  return pd.read_csv(csv_path)


def load_monthly_index():
  """Read the monthly index CSV file.

  Returns:
    The result of ``pd.read_csv`` for ``monthly_index.csv``.
  """
  return pd.read_csv('../awesomebook-master/data/monthly_index.csv')


def load_meros_txt():
  """Read ``meros.txt`` and return its whole content as a single string.

  Returns:
    str: the complete text of the file.

  Raises:
    OSError: if the file cannot be opened.
    UnicodeDecodeError: if the file content is not valid UTF-8.
  """
  # The encoding is given explicitly so the result does not depend on the
  # platform's default locale encoding.
  # NOTE(review): assumes the data file is UTF-8 encoded -- confirm.
  # The `with` block closes the file on exit, so no explicit close() is needed.
  with open('../awesomebook-master/data/txt/meros.txt', 'r',
            encoding='utf-8') as f:
    return f.read()


In [2]:
# Unpack the three tables returned by load_hotel_reserve(); customer_tb is the
# frame that the following cells add columns to.
customer_tb, hotel_tb, reserve_tb = load_hotel_reserve()

In [None]:
# Boolean flag: True where the customer's sex is 'man'. The element-wise
# comparison already yields a boolean Series, so a plain single-column
# assignment is enough (no DataFrame-to-DataFrame assignment, no astype).
customer_tb['sex_is_man'] = customer_tb['sex'] == 'man'

# Categorical version of 'sex' with an explicit category list; the position
# of a value in this list becomes its integer code.
customer_tb['sex_c'] = pd.Categorical(customer_tb['sex'], categories=['man', 'woman'])

# Integer code of every row (0 -> 'man', 1 -> 'woman').
print(customer_tb['sex_c'].cat.codes)

# The category master itself.
print(customer_tb['sex_c'].cat.categories)

0      0
1      0
2      1
3      0
4      0
      ..
995    0
996    0
997    1
998    1
999    0
Length: 1000, dtype: int8 Index(['man', 'woman'], dtype='object')


In [5]:
# Re-type the 'sex' column as a pandas categorical (categories are taken from
# the values present in the column).
sex_as_category = pd.Categorical(customer_tb['sex'])
customer_tb['sex'] = sex_as_category

# One indicator column per category; drop_first=False keeps every category
# instead of dropping the first one.
dummy_variables = pd.get_dummies(
    customer_tb['sex'],
    drop_first=False,
)

dummy_variables

Unnamed: 0,man,woman
0,True,False
1,True,False
2,False,True
3,True,False
4,True,False
...,...,...
995,True,False
996,True,False
997,False,True
998,False,True


In [8]:
# Bucket ages into decades (e.g. 41 -> 40.0, 38 -> 30.0) and store the result
# as a categorical column.
customer_tb['age_rank'] = pd.Categorical(np.floor(customer_tb['age'] / 10) * 10)

# A label has to be registered in the category master before it can be
# assigned to any row.
customer_tb['age_rank'] = customer_tb['age_rank'].cat.add_categories(['60 이상'])

# Collapse every customer aged 60 or more into the single merged bucket.
# Selecting on the raw age instead of a hard-coded list of decades
# (60.0, 70.0, 80.0) also covers ages of 90 and above.
customer_tb.loc[customer_tb['age'] >= 60, 'age_rank'] = '60 이상'

# The decade categories that were merged away no longer have any rows; drop
# them so the category master only lists buckets that are actually in use.
customer_tb['age_rank'] = customer_tb['age_rank'].cat.remove_unused_categories()

customer_tb[:40]

Unnamed: 0,customer_id,age,sex,home_latitude,home_longitude,sex_is_man,sex_c,age_rank
0,c_1,41,man,35.092193,136.512347,True,man,40.0
1,c_2,38,man,35.325076,139.410551,True,man,30.0
2,c_3,49,woman,35.120543,136.511179,False,woman,40.0
3,c_4,43,man,43.034868,141.240314,True,man,40.0
4,c_5,31,man,35.102661,136.523797,True,man,30.0
5,c_6,52,man,34.440768,135.390487,True,man,50.0
6,c_7,50,man,43.015758,141.231321,True,man,50.0
7,c_8,65,woman,38.201268,140.465961,False,woman,60 이상
8,c_9,36,woman,33.3228,130.330689,False,woman,30.0
9,c_10,34,woman,34.290414,132.302601,False,woman,30.0


In [22]:
from sklearn.neighbors import KNeighborsClassifier

# Load the production data and turn the literal string 'None' into a real
# missing value so that pandas' NaN-aware methods recognise it.
production_missc_tb = load_production_missing_category().replace('None', np.nan)

# Rows with a known 'type' form the training set; rows where 'type' is
# missing are the ones to be filled in.
type_is_missing = production_missc_tb['type'].isna()
train = production_missc_tb[~type_is_missing]

# .copy() makes `test` an independent frame, so writing the predictions into
# it below is unambiguous and does not raise SettingWithCopyWarning.
test = production_missc_tb[type_is_missing].copy()
print(test)

# Predict the missing 'type' from 'length' and 'thickness' using the three
# nearest training rows.
kn = KNeighborsClassifier(n_neighbors=3)

kn.fit(train[['length', 'thickness']], train['type'])

test['type'] = kn.predict(test[['length', 'thickness']])

print(test)

    type      length  thickness  fault_flg
8    NaN  276.386631  29.899611      False
26   NaN  263.844324  34.664251      False
30   NaN  129.364736  21.346752      False
36   NaN  203.378972  30.286454      False
41   NaN  157.463166  11.166165      False
..   ...         ...        ...        ...
971  NaN  130.088061   0.207250      False
980  NaN  284.562824  49.211790      False
983  NaN  264.130761   4.560416      False
992  NaN  182.252364  33.314305      False
996  NaN  134.773797  26.861665      False

[100 rows x 4 columns]
    type      length  thickness  fault_flg
8      E  276.386631  29.899611      False
26     E  263.844324  34.664251      False
30     E  129.364736  21.346752      False
36     A  203.378972  30.286454      False
41     E  157.463166  11.166165      False
..   ...         ...        ...        ...
971    A  130.088061   0.207250      False
980    E  284.562824  49.211790      False
983    B  264.130761   4.560416      False
992    A  182.252364  33.31430