## 讀取資料

In [1]:
import pandas

# Customer-churn CSV: the first row supplies the column names and the first
# column is used as the row index.
CHURN_CSV_URL = 'https://github.com/ywchiu/tibamedl/raw/master/Data/customer_churn.csv'
df = pandas.read_csv(CHURN_CSV_URL, header=0, index_col=0)
df.head()

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
1,KS,128,area_code_415,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,no
2,OH,107,area_code_415,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,no
3,NJ,137,area_code_415,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,no
4,OH,84,area_code_408,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,no
5,OK,75,area_code_415,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,no


## 資料前處理

In [0]:
# Keep everything from the fourth column onward. The explicit copy makes the
# result an independent frame, so the column assignments below never write
# into a slice of the frame that was loaded (chained assignment, which can
# raise pandas' SettingWithCopyWarning).
# NOTE: this cell rebinds `df`; re-running it without reloading the data
# would drop three more columns.
df = df.iloc[:, 3:].copy()
cat_var = ['international_plan', 'voice_mail_plan', 'churn']

# Encode the yes/no columns as integers: 'yes' -> 1, anything else -> 0.
for var in cat_var:
    df[var] = df[var].eq('yes').astype('int64')

# The last column is the target; every preceding column is a feature.
y = df.iloc[:, -1]
x = df.iloc[:, :-1]

## 分為訓練與測試資料集

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; random_state is fixed so the split
# can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=123,
)

## 資料標準化

In [0]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transformation to the test split. Re-fitting the
# scaler on the test data would scale the two splits with different
# statistics and leak test-set information into preprocessing.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

## K-Fold 交叉驗證

In [0]:
%tensorflow_version 2.x

In [0]:
import tensorflow as tf

In [0]:
def buildClassifier(optimizer, input_dim=16):
    """Build and compile a small fully connected binary classifier.

    The network is two hidden ``Dense`` layers of 8 ReLU units followed by a
    single sigmoid output unit. It is compiled with binary cross-entropy loss
    and an accuracy metric.

    Args:
        optimizer: Optimizer handed straight to ``model.compile`` (the
            notebook passes names such as ``'sgd'``).
        input_dim: Number of input features expected by the first layer.
            Defaults to 16, the value that used to be hard-coded here.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(8, activation='relu', input_dim=input_dim),
        tf.keras.layers.Dense(8, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [22]:
# Only the value of the last expression in a cell is displayed, so this cell
# shows y_train.shape; the x_train.shape result on the first line is discarded.
x_train.shape
y_train.shape

(2233,)

In [24]:
from sklearn.model_selection import cross_val_score

# Wrap the Keras model builder so scikit-learn can drive it as an estimator.
# `optimizer` is the argument buildClassifier takes; the remaining settings
# configure training.
classifier = tf.keras.wrappers.scikit_learn.KerasClassifier(
    build_fn=buildClassifier,
    optimizer='sgd',
    epochs=10,
    batch_size=10,
    verbose=0,
)

# 5-fold cross-validation on the training split.
accuracies = cross_val_score(estimator=classifier, X=x_train, y=y_train, cv=5)

# NOTE: despite its name, `variance` holds the standard deviation of the
# fold scores (it is computed with .std()).
mean, variance = accuracies.mean(), accuracies.std()

ERROR! Session/line number was not unique in database. History logging moved to new session 59


In [25]:
accuracies

array([0.85682327, 0.85458612, 0.87472034, 0.85426009, 0.86322868])

In [26]:
mean, variance

(0.8607236981391907, 0.00770509215434816)

## Dropout

In [0]:
def buildClassifierWithDropout(optimizer, input_dim=16, dropout_rate=0.1):
    """Build and compile the binary classifier with dropout after each hidden layer.

    The network is two hidden ``Dense`` layers of 8 ReLU units, each followed
    by a ``Dropout`` layer, and a single sigmoid output unit. It is compiled
    with binary cross-entropy loss and an accuracy metric.

    Args:
        optimizer: Optimizer handed straight to ``model.compile`` (the
            notebook passes names such as ``'sgd'`` or ``'adam'``).
        input_dim: Number of input features expected by the first layer.
            Defaults to 16, the value that used to be hard-coded here.
        dropout_rate: Rate passed to both ``Dropout`` layers. Defaults to
            0.1, the value that used to be hard-coded here.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(8, activation='relu', input_dim=input_dim),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(8, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [0]:
# Cross-validate the dropout variant of the network with 5 folds.
# NOTE(review): this run trains for 100 epochs, whereas the earlier run
# without dropout used 10, so the two scores differ by training length as
# well as by dropout.
classifier = tf.keras.wrappers.scikit_learn.KerasClassifier(
    build_fn=buildClassifierWithDropout,
    optimizer='sgd',
    epochs=100,
    batch_size=10,
    verbose=0,
)
accuracies = cross_val_score(estimator=classifier, X=x_train, y=y_train, cv=5)

# NOTE: despite its name, `variance` holds the standard deviation of the
# fold scores (it is computed with .std()).
mean, variance = accuracies.mean(), accuracies.std()

In [29]:
mean,variance

(0.919390869140625, 0.007230649075353533)

## Grid Search

In [0]:
from sklearn.model_selection import GridSearchCV

classifier = tf.keras.wrappers.scikit_learn.KerasClassifier(
    build_fn=buildClassifierWithDropout,
    verbose=0,
    epochs=10,
)

# Candidate values: every batch_size / optimizer combination is tried (2 x 2).
parameters = {
    'batch_size': [10, 15],
    'optimizer': ['adam', 'rmsprop'],
}

# Each combination is scored by accuracy with 2-fold cross-validation.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=2,
)
grid_search = grid_search.fit(x_train, y_train)

# Best-scoring parameter combination and its mean cross-validated accuracy.
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

## Randomized Search

In [35]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# This example uses the iris dataset and a logistic regression rather than
# the churn data and Keras model from the cells above.
iris = load_iris()
logistic = LogisticRegression(
    solver='saga',
    tol=1e-2,
    max_iter=200,
    random_state=0,
)

# Sampling space: C is drawn uniformly from [0, 4]; penalty is picked from
# the listed options.
distributions = {
    'C': uniform(loc=0, scale=4),
    'penalty': ['l2', 'l1'],
}

clf = RandomizedSearchCV(logistic, distributions, random_state=0)
search = clf.fit(iris.data, iris.target)
search.best_params_

ERROR! Session/line number was not unique in database. History logging moved to new session 60


{'C': 2.195254015709299, 'penalty': 'l1'}