# 인터넷 신규 가입 여부 예측 실습 예제
* 데이터 로드
    * pd.read_csv('newbie.csv')
* 종속 변수 : 'Newbie'
* 독립 변수 제거 : 'who', 'Country', 'Years on Internet'
* 결측값 처리 :
    * Age : 평균
    * Household Income, Sexual Preference, Marital Status : 최빈값으로 채움
* 범주형 독립변수 데이터 타입 "category"로 변환 후 더미 변수로 변환(원핫인코딩 처리)
    * pd.get_dummies() 사용
    * 사용예)
    * import numpy as np
    * import pandas as pd
    * fruit = pd.DataFrame({'name':['apple', 'banana', 'cherry', 'durian', np.nan],
        'color':['red', 'yellow', 'red', 'green', np.nan]})
    * pd.get_dummies(fruit)


In [41]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from library.preprocessing import DataFramePreprocessor

In [2]:
# Load the newbie dataset from CSV; the trailing bare expression makes the
# notebook render the resulting DataFrame.
newbie = pd.read_csv('data/newbie.csv')
newbie

Unnamed: 0,who,Newbie,Age,Gender,Household Income,Sexual Preference,Country,Education Attainment,Major Occupation,Marital Status,Years on Internet
0,id74364,0,54.0,Male,$50-74,Gay male,Ontario,Some College,Computer,Other,4-6 yr
1,id84505,0,39.0,Female,Over $100,Heterosexual,Sweden,Professional,Other,Other,1-3 yr
2,id84509,1,49.0,Female,$40-49,Heterosexual,Washington,Some College,Management,Other,Under 6 mo
3,id87028,1,22.0,Female,$40-49,Heterosexual,Florida,Some College,Computer,Married,6-12 mo
4,id76087,0,20.0,Male,$30-39,Bisexual,New Jersey,Some College,Education,Single,1-3 yr
...,...,...,...,...,...,...,...,...,...,...,...
19578,id83400,0,22.0,Male,Over $100,Heterosexual,Texas,Some College,Education,Single,4-6 yr
19579,id72216,0,19.0,Male,,Heterosexual,New Jersey,Some College,Education,Single,4-6 yr
19580,id8654,0,49.0,Female,$50-74,Heterosexual,Missouri,Doctoral,Education,Married,1-3 yr
19581,id84503,1,42.0,Female,$50-74,Heterosexual,Kentucky,Some College,Other,Married,Under 6 mo


In [3]:
# Print per-column dtypes and non-null counts to spot columns with missing values.
newbie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19583 entries, 0 to 19582
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   who                   19583 non-null  object 
 1   Newbie                19583 non-null  int64  
 2   Age                   19022 non-null  float64
 3   Gender                19583 non-null  object 
 4   Household Income      16398 non-null  object 
 5   Sexual Preference     18291 non-null  object 
 6   Country               19583 non-null  object 
 7   Education Attainment  19583 non-null  object 
 8   Major Occupation      19583 non-null  object 
 9   Marital Status        19240 non-null  object 
 10  Years on Internet     19583 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 1.6+ MB


In [4]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each row describes one column.
newbie.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
who,19583.0,19583.0,id74364,1.0,,,,,,,
Newbie,19583.0,,,,0.253383,0.434959,0.0,0.0,0.0,1.0,1.0
Age,19022.0,,,,35.222269,12.642339,5.0,25.0,33.0,44.0,80.0
Gender,19583.0,2.0,Male,13454.0,,,,,,,
Household Income,16398.0,8.0,$50-74,3935.0,,,,,,,
Sexual Preference,18291.0,6.0,Heterosexual,16869.0,,,,,,,
Country,19583.0,142.0,California,2312.0,,,,,,,
Education Attainment,19583.0,9.0,College,6194.0,,,,,,,
Major Occupation,19583.0,5.0,Computer,5921.0,,,,,,,
Marital Status,19240.0,6.0,Married,8742.0,,,,,,,


# 독립 변수 제거

In [5]:
# Remove the columns that are excluded from the set of predictors, then
# display the reduced DataFrame.
newbie = newbie.drop(columns=['who', 'Country', 'Years on Internet'])
newbie

Unnamed: 0,Newbie,Age,Gender,Household Income,Sexual Preference,Education Attainment,Major Occupation,Marital Status
0,0,54.0,Male,$50-74,Gay male,Some College,Computer,Other
1,0,39.0,Female,Over $100,Heterosexual,Professional,Other,Other
2,1,49.0,Female,$40-49,Heterosexual,Some College,Management,Other
3,1,22.0,Female,$40-49,Heterosexual,Some College,Computer,Married
4,0,20.0,Male,$30-39,Bisexual,Some College,Education,Single
...,...,...,...,...,...,...,...,...
19578,0,22.0,Male,Over $100,Heterosexual,Some College,Education,Single
19579,0,19.0,Male,,Heterosexual,Some College,Education,Single
19580,0,49.0,Female,$50-74,Heterosexual,Doctoral,Education,Married
19581,1,42.0,Female,$50-74,Heterosexual,Some College,Other,Married


# 결측값 처리

## Age : 평균

In [6]:
# Impute missing ages with the column mean.
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection: that chained in-place call operates on an intermediate
# object, raises a FutureWarning in pandas 2.x, and leaves `newbie` unchanged
# under Copy-on-Write (the default behaviour from pandas 3.0).
newbie['Age'] = newbie['Age'].fillna(newbie['Age'].mean())

In [7]:
# Confirm that Age no longer contains missing values.
newbie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19583 entries, 0 to 19582
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Newbie                19583 non-null  int64  
 1   Age                   19583 non-null  float64
 2   Gender                19583 non-null  object 
 3   Household Income      16398 non-null  object 
 4   Sexual Preference     18291 non-null  object 
 5   Education Attainment  19583 non-null  object 
 6   Major Occupation      19583 non-null  object 
 7   Marital Status        19240 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.2+ MB


## Household Income, Sexual Preference, Marital Status : 최빈값으로 채움

In [8]:
# Impute missing household income with the most frequent category.
# mode() returns a Series (ties are possible), so [0] picks the first mode.
# The result is assigned back instead of using a chained fillna(inplace=True),
# which warns in pandas 2.x and has no effect on `newbie` under Copy-on-Write
# (the default from pandas 3.0).
newbie['Household Income'] = newbie['Household Income'].fillna(
    newbie['Household Income'].mode()[0]
)
newbie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19583 entries, 0 to 19582
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Newbie                19583 non-null  int64  
 1   Age                   19583 non-null  float64
 2   Gender                19583 non-null  object 
 3   Household Income      19583 non-null  object 
 4   Sexual Preference     18291 non-null  object 
 5   Education Attainment  19583 non-null  object 
 6   Major Occupation      19583 non-null  object 
 7   Marital Status        19240 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.2+ MB


In [9]:
# Impute missing sexual preference with the most frequent category.
# mode() returns a Series (ties are possible), so [0] picks the first mode.
# The result is assigned back instead of using a chained fillna(inplace=True),
# which warns in pandas 2.x and has no effect on `newbie` under Copy-on-Write
# (the default from pandas 3.0).
newbie['Sexual Preference'] = newbie['Sexual Preference'].fillna(
    newbie['Sexual Preference'].mode()[0]
)
newbie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19583 entries, 0 to 19582
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Newbie                19583 non-null  int64  
 1   Age                   19583 non-null  float64
 2   Gender                19583 non-null  object 
 3   Household Income      19583 non-null  object 
 4   Sexual Preference     19583 non-null  object 
 5   Education Attainment  19583 non-null  object 
 6   Major Occupation      19583 non-null  object 
 7   Marital Status        19240 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.2+ MB


In [10]:
# Impute missing marital status with the most frequent category.
# mode() returns a Series (ties are possible), so [0] picks the first mode.
# The result is assigned back instead of using a chained fillna(inplace=True),
# which warns in pandas 2.x and has no effect on `newbie` under Copy-on-Write
# (the default from pandas 3.0).
newbie['Marital Status'] = newbie['Marital Status'].fillna(
    newbie['Marital Status'].mode()[0]
)
newbie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19583 entries, 0 to 19582
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Newbie                19583 non-null  int64  
 1   Age                   19583 non-null  float64
 2   Gender                19583 non-null  object 
 3   Household Income      19583 non-null  object 
 4   Sexual Preference     19583 non-null  object 
 5   Education Attainment  19583 non-null  object 
 6   Major Occupation      19583 non-null  object 
 7   Marital Status        19583 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.2+ MB


In [71]:
# Count the remaining missing values per column; every count should be zero
# once imputation has been applied.
newbie.isnull().sum()

Newbie                  0
Age                     0
Gender                  0
Household Income        0
Sexual Preference       0
Education Attainment    0
Major Occupation        0
Marital Status          0
dtype: int64

# 범주형 독립변수 데이터 타입 "category"로 변환 후 원핫인코딩

In [11]:
# Display the DataFrame as it stands before type conversion and encoding.
newbie

Unnamed: 0,Newbie,Age,Gender,Household Income,Sexual Preference,Education Attainment,Major Occupation,Marital Status
0,0,54.0,Male,$50-74,Gay male,Some College,Computer,Other
1,0,39.0,Female,Over $100,Heterosexual,Professional,Other,Other
2,1,49.0,Female,$40-49,Heterosexual,Some College,Management,Other
3,1,22.0,Female,$40-49,Heterosexual,Some College,Computer,Married
4,0,20.0,Male,$30-39,Bisexual,Some College,Education,Single
...,...,...,...,...,...,...,...,...
19578,0,22.0,Male,Over $100,Heterosexual,Some College,Education,Single
19579,0,19.0,Male,$50-74,Heterosexual,Some College,Education,Single
19580,0,49.0,Female,$50-74,Heterosexual,Doctoral,Education,Married
19581,1,42.0,Female,$50-74,Heterosexual,Some College,Other,Married


In [12]:
# Column groups used by the preprocessing step: the single numeric predictor
# and the categorical predictors.
numeric_cols = ['Age']
categorical_cols = [
    'Gender',
    'Household Income',
    'Sexual Preference',
    'Education Attainment',
    'Major Occupation',
    'Marital Status',
]

In [13]:
# Convert every categorical predictor to the pandas "category" dtype in a
# single astype call; all other columns keep their dtype.
newbie = newbie.astype({col: 'category' for col in categorical_cols})

In [14]:
# Verify that the categorical predictors now have the "category" dtype.
newbie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19583 entries, 0 to 19582
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Newbie                19583 non-null  int64   
 1   Age                   19583 non-null  float64 
 2   Gender                19583 non-null  category
 3   Household Income      19583 non-null  category
 4   Sexual Preference     19583 non-null  category
 5   Education Attainment  19583 non-null  category
 6   Major Occupation      19583 non-null  category
 7   Marital Status        19583 non-null  category
dtypes: category(6), float64(1), int64(1)
memory usage: 422.3 KB


## MinMax Scaling과 OneHot Encoding 수행

In [61]:
# Transformers for the preprocessing step: min-max scaling and one-hot
# encoding that emits uint8 indicator values.
mms = MinMaxScaler()
ohe = OneHotEncoder(dtype="uint8")
# Project-local helper used below to run the transformers over the DataFrame.
dfp = DataFramePreprocessor()

In [62]:
# Build the feature matrix with the project helper: the transformer list and
# the column-group list are passed in matching order (mms with numeric_cols,
# ohe with categorical_cols) — presumably paired by position; confirm against
# DataFramePreprocessor. `tfs` is presumably the fitted transformers — TODO confirm.
df_X, tfs = dfp.fit_transform_multiple_transformer(newbie, [mms, ohe], [numeric_cols, categorical_cols])
# Inspect the resulting feature columns and their dtypes.
df_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19583 entries, 0 to 19582
Data columns (total 37 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                19583 non-null  float64
 1   Gender_Female                      19583 non-null  uint8  
 2   Gender_Male                        19583 non-null  uint8  
 3   Household Income_$10-19            19583 non-null  uint8  
 4   Household Income_$20-29            19583 non-null  uint8  
 5   Household Income_$30-39            19583 non-null  uint8  
 6   Household Income_$40-49            19583 non-null  uint8  
 7   Household Income_$50-74            19583 non-null  uint8  
 8   Household Income_$75-99            19583 non-null  uint8  
 9   Household Income_Over $100         19583 non-null  uint8  
 10  Household Income_Under $10         19583 non-null  uint8  
 11  Sexual Preference_Bisexual         19583 non-null  uin

In [63]:
# Target (dependent) variable: the 'Newbie' column.
df_y = newbie.loc[:, 'Newbie']

# 데이터 분리

In [68]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_y,
    test_size=0.33,
    random_state=1,
)
# Preview the first five training rows.
X_train.head(n=5)

Unnamed: 0,Age,Gender_Female,Gender_Male,Household Income_$10-19,Household Income_$20-29,Household Income_$30-39,Household Income_$40-49,Household Income_$50-74,Household Income_$75-99,Household Income_Over $100,...,Major Occupation_Education,Major Occupation_Management,Major Occupation_Other,Major Occupation_Professional,Marital Status_Divorced,Marital Status_Married,Marital Status_Other,Marital Status_Separated,Marital Status_Single,Marital Status_Widowed
17107,0.706667,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
112,0.546667,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
17900,0.373333,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,0
8497,0.306667,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
16655,0.32,1,0,0,0,1,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0


# 로지스틱 회귀 수행

In [65]:
# Fit a logistic regression with default settings on the training split
# (fit returns the estimator itself), then report its mean accuracy on the
# held-out split.
log_clf = LogisticRegression().fit(X_train, y_train)
log_clf.score(X_test, y_test)

0.7630638297872341