## 1. Connect to the database

In [71]:
import os

import numpy as np
import pandas as pd
import pymysql.cursors
from numpy import isnan
from sklearn.impute import SimpleImputer

# Connection settings can be overridden through environment variables so that
# credentials do not have to be edited into the notebook. The fallbacks
# reproduce the original local-development values.
# NOTE(review): the fallback password is still a hardcoded secret; drop it once
# TIP_DB_PASSWORD is provided everywhere this notebook runs.
connection = pymysql.connect(host=os.environ.get('TIP_DB_HOST', 'localhost'),
        user=os.environ.get('TIP_DB_USER', 'root'),
        password=os.environ.get('TIP_DB_PASSWORD', '1234'),
        db=os.environ.get('TIP_DB_NAME', 'tip'),
        charset='utf8',
        cursorclass=pymysql.cursors.DictCursor)

try:
    with connection.cursor() as cursor:
        # Read every row of the `tips` table (fetchall, not a single record).
        sql = "SELECT * FROM tips;"
        cursor.execute(sql)
        result = cursor.fetchall()
finally:
    # Release the connection whether or not the query succeeded.
    connection.close()

# Build the DataFrame from the fetched rows once the connection is closed;
# if the query raised, execution never reaches this line.
df = pd.DataFrame(result)

---  
## 2. 결측치 확인  

In [72]:
# Print dtypes and non-null counts per column to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  250 non-null    float64
 1   tip         248 non-null    float64
 2   sex         250 non-null    object 
 3   smoker      250 non-null    object 
 4   day         250 non-null    object 
 5   time        249 non-null    object 
 6   size        248 non-null    float64
dtypes: float64(3), object(4)
memory usage: 13.8+ KB


In [73]:
# Count the missing (null) entries in each column.
df.isnull().sum()

total_bill    0
tip           2
sex           0
smoker        0
day           0
time          1
size          2
dtype: int64

## 3. Encoding (범주형 Data -> 수치형 Data)

In [74]:
# Encode the categorical columns as integer codes.
# The result is assigned back to the column rather than calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary Series
# and leaves `df` unchanged when pandas Copy-on-Write is active.
df['sex'] = df['sex'].replace({'Female': 0, 'Male': 1})
df['smoker'] = df['smoker'].replace({'No': 0, 'Yes': 1})
df['day'] = df['day'].replace({'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun': 3})
df['time'] = df['time'].replace({'Lunch': 0, 'Dinner': 1})

In [75]:
# Re-inspect dtypes and non-null counts after the encoding step.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  250 non-null    float64
 1   tip         248 non-null    float64
 2   sex         250 non-null    int64  
 3   smoker      250 non-null    int64  
 4   day         250 non-null    object 
 5   time        249 non-null    float64
 6   size        248 non-null    float64
dtypes: float64(4), int64(2), object(1)
memory usage: 13.8+ KB


---  
## 4. 결측치 처리(median)  

In [77]:
# Treat empty strings as missing values by converting them to NaN.
df.replace(to_replace='', value=np.nan, inplace=True)

In [80]:
# Fill missing values with each column's median using SimpleImputer.
imputer = SimpleImputer(strategy='median')

# Learn the per-column medians and apply them in one step.
imputed_values = imputer.fit_transform(df)

# Wrap the imputed values back into a DataFrame with the original labels.
# Fix: the original passed the undefined name `data_xtrans` here, which raised
# a NameError; the transformed array is what must be wrapped.
data_trans = pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

In [81]:
# Verify that no missing values remain after imputation.
data_trans.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [87]:
# Show the (rows, columns) shape of the imputed frame.
data_trans.shape

(250, 7)

## 5.독립변수와 목표변수 설정

In [14]:
# List the column names available for choosing features and target.
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [82]:
# Target variable: the bill amount.
y = data_trans['total_bill']
# Independent variables: every column except the target.
x = data_trans.drop(columns='total_bill')

In [83]:
# Plain NumPy view of the feature frame, used by the scalers and selectors below.
data = x.values

In [90]:
# Display the target series.
y

0      16.99
1      10.34
2      21.01
3      23.68
4      24.59
       ...  
245    20.65
246    16.99
247    16.99
248    16.99
249    16.99
Name: total_bill, Length: 250, dtype: float64

In [91]:
# Display the feature array.
data

array([[1.01, 0.  , 0.  , 3.  , 1.  , 2.  ],
       [1.66, 1.  , 0.  , 3.  , 1.  , 3.  ],
       [3.5 , 1.  , 0.  , 3.  , 1.  , 3.  ],
       ...,
       [1.01, 0.  , 0.  , 3.  , 1.  , 2.  ],
       [2.9 , 0.  , 0.  , 2.  , 1.  , 2.  ],
       [2.9 , 0.  , 0.  , 2.  , 1.  , 2.  ]])

--- 
## 6. 표준화  

   * ### 표준화 과정
        -  데이터를 입력으로 하여 fit 메서드를 실행하면 분포 모수를 객체 내에 저장하고 데이터
            를 입력으로 하여 transform 메서드를 실행하면 데이터를 변환  
            
        - 2개 과정을 합쳐서 fit_transform 메서드를 사용할 수 있음
   * ### 주의점
        - 훈련 세트와 테스트 세트로 데이터가 나누어져 있는 경우, 스케일러는 훈련 세트로만 fit 하고
            동일한 스케일러(같은 모수)로 훈련 세트와 테스트 세트를 모두 transform 해야 함  

****

* ### MinMaxScaler 
    - MinMaxScaler는 특정값에 집중되어 있는 데이터가 그렇지 않은 데이터 분포보다 표준
        편차에 의한 스케일 변화값이 커지게 되며 한쪽으로 쏠림 현상이 있는 데이터 분포는
        형태가 거의 유지된 채 범위값이 조절
    - 각 변수의 최대값이 1, 최소값이 0이 되도록 변환
    - <u>**신경망**</u>에서 많이 이용

In [146]:
from sklearn.preprocessing import MinMaxScaler

# Fit a min-max scaler on the feature array and rescale it in one step.
trans = MinMaxScaler()
scaled_minmax = trans.fit_transform(data)

# Re-attach the feature names and row index of `x` to the scaled values.
df_norm = pd.DataFrame(scaled_minmax, index=x.index, columns=x.columns)

df_norm.head()

Unnamed: 0,tip,sex,smoker,day,time,size
0,0.001111,0.0,0.0,1.0,1.0,0.2
1,0.073333,1.0,0.0,1.0,1.0,0.4
2,0.277778,1.0,0.0,1.0,1.0,0.4
3,0.256667,1.0,0.0,1.0,1.0,0.2
4,0.29,0.0,0.0,1.0,1.0,0.6


In [147]:
# Summary statistics of the min-max scaled features.
df_norm.describe()

Unnamed: 0,tip,sex,smoker,day,time,size
count,250.0,250.0,250.0,250.0,250.0,250.0
mean,0.220489,0.636,0.372,0.58,0.728,0.316
std,0.153195,0.482114,0.484308,0.382633,0.445883,0.191758
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.111111,0.0,0.0,0.333333,0.0,0.2
50%,0.211111,1.0,0.0,0.666667,1.0,0.2
75%,0.278611,1.0,1.0,1.0,1.0,0.4
max,1.0,1.0,1.0,1.0,1.0,1.0


* ### StandardScaler 
    - StandardScaler와 RobustScaler의 변환된 결과가 대부분 표준화된 유사 형태의 데이터
        분포로 반환 
    - 평균이 0, 표준편차가 1이 되도록 변환
    - <u>**주성분 분석**</u>에서 많이 이용

In [110]:
from sklearn.preprocessing import StandardScaler

In [111]:
# Fit the standard scaler on the feature array, then apply it to the same data.
sc = StandardScaler().fit(data)
df_sc = sc.transform(data)

# Label the standardized values with the feature names of `x`.
df_transform_sc = pd.DataFrame(data=df_sc, columns=x.columns)

# Preview the first rows, rounded to whole numbers.
df_transform_sc.head().round()

Unnamed: 0,tip,sex,smoker,day,time,size
0,-1.0,-1.0,-1.0,1.0,1.0,-1.0
1,-1.0,1.0,-1.0,1.0,1.0,0.0
2,0.0,1.0,-1.0,1.0,1.0,0.0
3,0.0,1.0,-1.0,1.0,1.0,-1.0
4,0.0,-1.0,-1.0,1.0,1.0,1.0


In [112]:
# Summary statistics of the standardized features, rounded to whole numbers.
df_transform_sc.describe().round()

Unnamed: 0,tip,sex,smoker,day,time,size
count,250.0,250.0,250.0,250.0,250.0,250.0
mean,0.0,-0.0,-0.0,0.0,0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.0,-1.0,-1.0,-2.0,-2.0,-2.0
25%,-1.0,-1.0,-1.0,-1.0,-2.0,-1.0
50%,-0.0,1.0,-1.0,0.0,1.0,-1.0
75%,0.0,1.0,1.0,1.0,1.0,0.0
max,5.0,1.0,1.0,1.0,1.0,4.0


---  
## 7. Feature Selection

   ### - 독립변수들간의 상관관계로 주성분 추출


* ### PCA
     공분산 행렬의 고유값(eigenvalue)과 고유벡터를 구한 뒤, 고유값이 큰 순서대로 n_components개의 주성분을 선택하여 데이터를 투영한다

In [143]:
from sklearn.decomposition import PCA

# Reduce the feature array to four principal components.
# NOTE(review): PCA is fit on the unscaled `data`, not on the standardized
# `df_sc` computed earlier. Confirm this is intended.
# NOTE(review): `trans` is also the name of the MinMaxScaler instance above;
# this cell rebinds it.
trans = PCA(n_components=4)
x_dim = trans.fit_transform(data)

# Print the first three projected rows.
print(x_dim[:3])

[[-1.54950666 -1.83178143  0.2595805  -0.64413868]
 [-0.5124604  -1.64280136  0.87497557 -0.06399968]
 [ 1.04143102 -1.04643997  0.10007426 -0.1831451 ]]


  
### - 독립변수와 목표변수의 관계로 변수 선택
* ### RFE (Recursive Feature Elimination)

In [148]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

# Linear-kernel SVR is the estimator that RFE refits while eliminating features.
estimator = SVR(kernel="linear")

# Eliminate features recursively until four remain.
rfe = RFE(estimator=estimator, n_features_to_select=4)
selector = rfe.fit(data, y)

# Boolean mask over the feature columns: True marks a selected feature.
selector.support_

array([ True, False,  True, False,  True,  True])

In [117]:
# Names of the columns whose RFE support flag is True.
RFE_featurn_4 = [column for keep, column in zip(selector.support_, x.columns) if keep]
RFE_featurn_4

['tip', 'smoker', 'time', 'size']

* ### Regression Feature Selection (Pearson's)

In [149]:
from sklearn.feature_selection import SelectKBest, f_regression

# Univariate selector that keeps the four best features scored by f_regression.
fs = SelectKBest(score_func=f_regression, k=4)

# Shape of the feature array before selection (6 columns).
data.shape

(250, 6)

In [118]:
# Score the features against the target, then keep only the selected columns.
fs.fit(data, y)
X_selected = fs.transform(data)

# Boolean mask over the feature columns: True marks a selected feature.
fs.get_support()

array([ True, False, False,  True,  True,  True])

In [119]:
# Names of the columns that SelectKBest kept.
RFS_featurn_4 = [column for keep, column in zip(fs.get_support(), x.columns) if keep]
RFS_featurn_4

['tip', 'day', 'time', 'size']

---
