In [3]:
import pandas as pd

# Root directory of the project; the cached HDF5 frames are read from data/cache below it.
project_path = '/home/wjunneng/Python/2019-Construction-And-Forecast-Of-Telecom-Customer-Churn-Model'

train_15p_cache_path = '{}{}'.format(project_path, '/data/cache/train_15p.h5')
train_85p_cache_path = '{}{}'.format(project_path, '/data/cache/train_85p.h5')

# Each cache file is opened read-only and holds one frame under a key named after the file.
train_15p = pd.read_hdf(train_15p_cache_path, key='train_15p', mode='r')
train_85p = pd.read_hdf(train_85p_cache_path, key='train_85p', mode='r')

# Remove the Churn attribute: every remaining column of train_15p is a feature.
columns = list(train_15p.columns)
columns.remove('Churn')

# Labels come from train_15p; missing feature values are replaced by 0 in both frames.
y_train = train_15p['Churn'].astype(int)
X_train = train_15p[columns].fillna(0)
X_test = train_85p.fillna(0)



In [4]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Baseline classifiers that are cross-validated on the labelled training frame.
model_factory = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()
]

num_folds = 3
random_seed = 42

for model in model_factory:
    # cross_val_score clones the estimator from its constructor parameters, so
    # an ad-hoc `model.seed` attribute never reaches the fitted copies. Seed
    # through `random_state` for the estimators that expose it.
    if 'random_state' in model.get_params(deep=False):
        model.set_params(random_state=random_seed)

    # NOTE(review): 'r2' is a regression metric; it is kept because the report
    # below is labelled R2, but 'accuracy' may be the intended score here.
    scores = cross_val_score(model, X_train, y_train, cv=num_folds, scoring='r2', n_jobs=8)
    score_description = " %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)

    # The fold count in the label is taken from num_folds so it cannot drift
    # from the value actually passed to cross_val_score.
    print('{model:25} CV-{num_folds} R2: {score}'.format(
        model=model.__class__.__name__,
        num_folds=num_folds,
        score=score_description
    ))




KNeighborsClassifier      CV-5 R2:  -0.34 (+/- 0.30)
SVC                       CV-5 R2:  -0.15 (+/- 0.03)
SVC                       CV-5 R2:  -0.17 (+/- 0.01)
GaussianProcessClassifier CV-5 R2:  -0.17 (+/- 0.01)
DecisionTreeClassifier    CV-5 R2:  -0.96 (+/- 1.92)
RandomForestClassifier    CV-5 R2:  -0.18 (+/- 0.04)
MLPClassifier             CV-5 R2:  -0.54 (+/- 1.07)
AdaBoostClassifier        CV-5 R2:  -0.38 (+/- 0.37)
GaussianNB                CV-5 R2:  -0.53 (+/- 0.73)
QuadraticDiscriminantAnalysis CV-5 R2:  -0.23 (+/- 0.14)


In [5]:

from sklearn.utils import shuffle
from sklearn.base import BaseEstimator, RegressorMixin

class PseudoLabeler(BaseEstimator, RegressorMixin):
    """
    Scikit-learn wrapper for creating pseudo-labeled estimators.

    The wrapped model is first fitted on the labeled data and used to predict
    labels for the unlabeled data. A random subset of those pseudo-labeled rows
    is then appended to the labeled data and the model is fitted again on the
    combined, shuffled set.
    """

    def __init__(self, model, unlabled_data, features, target, sample_rate=0.2, seed=42):
        """
        @model         - estimator exposing fit(X, y) and predict(X)
        @unlabled_data - DataFrame of rows without labels
        @features      - column labels used as model inputs
        @target        - name of the label column
        @sample_rate   - fraction (0.0 to 1.0) of the unlabeled rows that is
                         added to the training set as pseudo-labeled data
        @seed          - random state used for sampling the pseudo-labeled
                         rows and for shuffling the augmented training set
        """
        assert 0.0 <= sample_rate <= 1.0, 'Sample_rate should be between 0.0 and 1.0.'

        # Parameters are stored unmodified so that get_params() hands back the
        # very objects that were passed in.
        self.sample_rate = sample_rate
        self.seed = seed
        self.model = model
        # Only has an effect on models that read a `seed` attribute.
        self.model.seed = seed

        self.unlabled_data = unlabled_data
        self.features = features
        self.target = target

    def get_params(self, deep=True):
        """
        Return the constructor parameters as a dict (`deep` is ignored).
        """
        return {
            "sample_rate": self.sample_rate,
            "seed": self.seed,
            "model": self.model,
            "unlabled_data": self.unlabled_data,
            "features": self.features,
            "target": self.target
        }

    def set_params(self, **parameters):
        """
        Set each given parameter as an attribute and return self.
        """
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def fit(self, X, y):
        """
        Fit the wrapped model on the labeled data plus pseudo-labeled samples.

        X is expected to be a DataFrame holding the feature columns and y the
        matching labels. Returns self.
        """
        augmented_train = self.__create_augmented_train(X, y)
        self.model.fit(
            augmented_train[self.features],
            augmented_train[self.target]
        )

        return self

    def __create_augmented_train(self, X, y):
        """
        Create and return the augmented train set that consists
        of pseudo-labeled and labeled data, in shuffled row order.
        """
        num_of_samples = int(len(self.unlabled_data) * self.sample_rate)

        # Attach the labels under the target column name explicitly so the
        # result does not depend on how the caller named `y`.
        labeled_data = X.copy()
        labeled_data[self.target] = y

        if num_of_samples == 0:
            # No pseudo-labeled rows are requested, so the preliminary fit and
            # prediction would be discarded anyway.
            return shuffle(labeled_data, random_state=self.seed)

        # Train the model and create the pseudo-labels
        self.model.fit(X, y)
        pseudo_labels = self.model.predict(self.unlabled_data[self.features])

        # Add the pseudo-labels to a copy of the unlabeled set
        pseudo_data = self.unlabled_data.copy(deep=True)
        pseudo_data[self.target] = pseudo_labels

        # Take a reproducible subset of the pseudo-labeled rows and append it
        # to the labeled training data
        sampled_pseudo_data = pseudo_data.sample(n=num_of_samples, random_state=self.seed)
        augmented_train = pd.concat([sampled_pseudo_data, labeled_data])

        return shuffle(augmented_train, random_state=self.seed)

    def predict(self, X):
        """
        Returns the values predicted by the wrapped model.
        """
        return self.model.predict(X)

    def get_model_name(self):
        """
        Returns the class name of the wrapped model.
        """
        return self.model.__class__.__name__




In [6]:
from xgboost import XGBClassifier

# Wrap an XGBoost classifier so that it is fitted on the labelled rows plus
# pseudo-labelled rows sampled from X_test.
model = PseudoLabeler(
    XGBClassifier(nthread=10),
    X_test,
    X_test.columns,
    'Churn'
)

model.fit(X_train, y_train)
# Predict on the NaN-filled frame: training and pseudo-labelling both use
# fillna(0) data, so the raw train_85p frame would present missing values
# differently from what the model was fitted on.
predict = model.predict(X_test)




In [7]:
from xgboost import XGBClassifier
# Compare a plain XGBoost classifier with the same classifier wrapped in the
# pseudo-labelling estimator.
model_factory = [
    XGBClassifier(nthread=10),

    PseudoLabeler(
        XGBClassifier(nthread=10),
        X_test,
        X_test.columns,
        'Churn',
        sample_rate=0.3
    ),
]

num_folds = 8

for model in model_factory:
    # NOTE(review): this only takes effect for estimators that expose `seed`
    # as a parameter (PseudoLabeler does) - confirm for the installed xgboost.
    model.seed = 42

    scores = cross_val_score(model, X_train, y_train, cv=num_folds, scoring='accuracy', n_jobs=8)
    # The metric requested above is accuracy, so the report is labelled accordingly.
    score_description = "Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2)

    print('{model:25} CV-{num_folds} {score_cv}'.format(
        model=model.__class__.__name__,
        num_folds=num_folds,
        score_cv=score_description
    ))
    
    
    

XGBClassifier             CV-8 R2: 0.8719 (+/- 0.1045)
PseudoLabeler             CV-8 R2: 0.8800 (+/- 0.1007)


In [12]:
# Replace any missing feature values with 0 in both frames.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMClassifier

# Ten evenly spaced sample rates covering 0 through 1 inclusive.
sample_rates = np.linspace(start=0, stop=1, num=10)
sns.set(color_codes=True)

def pseudo_label_wrapper(model):
    """
    Wrap `model` in a PseudoLabeler that targets the 'Churn' column.

    The NaN-filled X_test frame is used as the unlabelled data rather than the
    raw train_85p frame, so that wrapped estimators are never asked to predict
    on missing values that the training data does not contain.
    """
    return PseudoLabeler(model, X_test, X_test.columns, 'Churn')

# List of all models to test
model_factory = [
    KNeighborsClassifier(3),
    GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    XGBClassifier(),
    LGBMClassifier()
]

# Apply the PseudoLabeler class to each model. A list (rather than a one-shot
# map iterator) keeps model_factory usable after the loop below has run.
model_factory = [pseudo_label_wrapper(base_model) for base_model in model_factory]

# Train each model with different sample rates
results = {}
num_folds = 5

for model in model_factory:
    model_name = model.get_model_name()
    print('%s' % model_name)

    results[model_name] = list()
    for sample_rate in sample_rates:
        model.sample_rate = sample_rate

        # Calculate the mean cross-validated accuracy over num_folds folds and store it
        scores = cross_val_score(model, X_train, y_train, cv=num_folds, scoring='accuracy', n_jobs=11)
        results[model_name].append(scores.mean())

print(results)
    
plt.figure(figsize=(18, 18))

# One subplot per model. The grid keeps the original 4x3 layout and only grows
# extra rows when there are more than 12 entries in results.
n_cols = 3
n_rows = max(4, -(-len(results) // n_cols))

for i, (model_name, performance) in enumerate(results.items(), start=1):
    plt.subplot(n_rows, n_cols, i)

    # A model whose sweep was interrupted has fewer scores than sample rates;
    # plot only the rates that were actually evaluated.
    plt.plot(sample_rates[:len(performance)], performance)
    plt.title(model_name)
    plt.xlabel('sample_rate')
    plt.ylabel('Accuracy-score')


plt.show()

KNeighborsClassifier


KeyboardInterrupt: 