JMJPFU

18-Mar-2020

Lord bless this attempt of yours
### Experimenting with autoencoders for Telecom churn use case



In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 300)

In [2]:
# read data
churn = pd.read_csv("D:/JMJTL/JMJTL_MLP/CustomerChurn/Datafiles/telecom_churn_data.csv")

In [3]:
# create backup of data
original = churn.copy()

In [4]:
# create column name list by types of columns
id_cols = ['mobile_number', 'circle_id']

date_cols = ['last_date_of_month_6',
             'last_date_of_month_7',
             'last_date_of_month_8',
             'last_date_of_month_9',
             'date_of_last_rech_6',
             'date_of_last_rech_7',
             'date_of_last_rech_8',
             'date_of_last_rech_9',
             'date_of_last_rech_data_6',
             'date_of_last_rech_data_7',
             'date_of_last_rech_data_8',
             'date_of_last_rech_data_9'
            ]

cat_cols =  ['night_pck_user_6',
             'night_pck_user_7',
             'night_pck_user_8',
             'night_pck_user_9',
             'fb_user_6',
             'fb_user_7',
             'fb_user_8',
             'fb_user_9'
            ]

num_cols = [column for column in churn.columns if column not in id_cols + date_cols + cat_cols]

# print the number of columns in each list
print("#ID cols: %d\n#Date cols:%d\n#Numeric cols:%d\n#Category cols:%d" % (len(id_cols), len(date_cols), len(num_cols), len(cat_cols)))

# check if we have missed any column or not
print(len(id_cols) + len(date_cols) + len(num_cols) + len(cat_cols) == churn.shape[1])

#ID cols: 2
#Date cols:12
#Numeric cols:204
#Category cols:8
True


In [5]:
# some recharge columns have minimum value of 1 while some don't
recharge_cols = ['total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8', 'total_rech_data_9',
                 'count_rech_2g_6', 'count_rech_2g_7', 'count_rech_2g_8', 'count_rech_2g_9',
                 'count_rech_3g_6', 'count_rech_3g_7', 'count_rech_3g_8', 'count_rech_3g_9',
                 'max_rech_data_6', 'max_rech_data_7', 'max_rech_data_8', 'max_rech_data_9',
                 'av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8', 'av_rech_amt_data_9',
                 ]

In [6]:
# create a list of recharge columns where we will impute missing values with zeroes
zero_impute = ['total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8', 'total_rech_data_9',
        'av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8', 'av_rech_amt_data_9',
        'max_rech_data_6', 'max_rech_data_7', 'max_rech_data_8', 'max_rech_data_9'
       ]

In [7]:
# impute missing values with 0
churn[zero_impute] = churn[zero_impute].apply(lambda x: x.fillna(0))

In [8]:
# drop id and date columns
print("Shape before dropping: ", churn.shape)
churn = churn.drop(id_cols + date_cols, axis=1)
print("Shape after dropping: ", churn.shape)

Shape before dropping:  (99999, 226)
Shape after dropping:  (99999, 212)


In [9]:
# replace missing values with '-1' in categorical columns
churn[cat_cols] = churn[cat_cols].apply(lambda x: x.fillna(-1))

In [10]:
initial_cols = churn.shape[1]

MISSING_THRESHOLD = 0.7

include_cols = list(churn.apply(lambda column: True if column.isnull().sum()/churn.shape[0] < MISSING_THRESHOLD else False))

drop_missing = pd.DataFrame({'features':churn.columns , 'include': include_cols})
drop_missing.loc[drop_missing.include == True,:]

Unnamed: 0,features,include
0,loc_og_t2o_mou,True
1,std_og_t2o_mou,True
2,loc_ic_t2o_mou,True
3,arpu_6,True
4,arpu_7,True
5,arpu_8,True
6,arpu_9,True
7,onnet_mou_6,True
8,onnet_mou_7,True
9,onnet_mou_8,True


In [11]:
# drop columns
churn = churn.loc[:, include_cols]

dropped_cols = churn.shape[1] - initial_cols
print("{0} columns dropped.".format(dropped_cols))

-16 columns dropped.


In [12]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [13]:
churn_imputed = IterativeImputer(max_iter=1).fit_transform(churn)

In [15]:
churn_cols = churn.columns
churn = pd.DataFrame(churn_imputed, columns=churn_cols)
print(churn.isnull().sum()*100/churn.shape[0])

loc_og_t2o_mou        0.0
std_og_t2o_mou        0.0
loc_ic_t2o_mou        0.0
arpu_6                0.0
arpu_7                0.0
arpu_8                0.0
arpu_9                0.0
onnet_mou_6           0.0
onnet_mou_7           0.0
onnet_mou_8           0.0
onnet_mou_9           0.0
offnet_mou_6          0.0
offnet_mou_7          0.0
offnet_mou_8          0.0
offnet_mou_9          0.0
roam_ic_mou_6         0.0
roam_ic_mou_7         0.0
roam_ic_mou_8         0.0
roam_ic_mou_9         0.0
roam_og_mou_6         0.0
roam_og_mou_7         0.0
roam_og_mou_8         0.0
roam_og_mou_9         0.0
loc_og_t2t_mou_6      0.0
loc_og_t2t_mou_7      0.0
loc_og_t2t_mou_8      0.0
loc_og_t2t_mou_9      0.0
loc_og_t2m_mou_6      0.0
loc_og_t2m_mou_7      0.0
loc_og_t2m_mou_8      0.0
loc_og_t2m_mou_9      0.0
loc_og_t2f_mou_6      0.0
loc_og_t2f_mou_7      0.0
loc_og_t2f_mou_8      0.0
loc_og_t2f_mou_9      0.0
loc_og_t2c_mou_6      0.0
loc_og_t2c_mou_7      0.0
loc_og_t2c_mou_8      0.0
loc_og_t2c_m

In [16]:
churn.shape

(99999, 196)

In [18]:
# calculate total incoming and outgoing minutes of usage
churn['total_calls_mou_9'] = churn.total_ic_mou_9 + churn.total_og_mou_9

# calculate 2g and 3g data consumption
churn['total_internet_mb_9'] =  churn.vol_2g_mb_9 + churn.vol_3g_mb_9

# create churn variable: those who have not used either calls or internet in the month of September are customers who have churned

# 0 - not churn, 1 - churn
churn['churn'] = churn.apply(lambda row: 1 if (row.total_calls_mou_9 == 0 and row.total_internet_mb_9 == 0) else 0, axis=1)

# delete derived variables
churn_filtered = churn.drop(['total_calls_mou_9', 'total_internet_mb_9'], axis=1)

# change data type to category
churn_filtered.churn = churn_filtered.churn.astype("category")

# print churn ratio
print("Churn Ratio:")
print(churn_filtered.churn.value_counts()*100/churn_filtered.shape[0])

Churn Ratio:
0    89.808898
1    10.191102
Name: churn, dtype: float64


In [19]:
churn_filtered.shape

(99999, 197)

In [20]:
# delete all variables relating to 9th month
churn_filtered = churn_filtered.filter(regex='[^9]$', axis=1)
churn_filtered.shape

(99999, 150)

In [21]:
# extract all names that end with 9
col_9_names = churn.filter(regex='9$', axis=1).columns

# update num_cols and cat_cols column name list
cat_cols = [col for col in cat_cols if col not in col_9_names]
cat_cols.append('churn')
num_cols = [col for col in churn_filtered.columns if col not in cat_cols]

In [23]:
churn_filtered.columns

Index(['loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou', 'arpu_6',
       'arpu_7', 'arpu_8', 'onnet_mou_6', 'onnet_mou_7', 'onnet_mou_8',
       'offnet_mou_6',
       ...
       'sachet_3g_8', 'fb_user_6', 'fb_user_7', 'fb_user_8', 'aon',
       'aug_vbc_3g', 'jul_vbc_3g', 'jun_vbc_3g', 'sep_vbc_3g', 'churn'],
      dtype='object', length=150)

In [32]:
dataTypes = churn_filtered.dtypes
set(dataTypes)

{dtype('float64')}

In [25]:
# Creating the target variable

y = churn_filtered.pop('churn')
y.shape

(99999,)

In [29]:
X = churn_filtered
X.columns

Index(['loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou', 'arpu_6',
       'arpu_7', 'arpu_8', 'onnet_mou_6', 'onnet_mou_7', 'onnet_mou_8',
       'offnet_mou_6',
       ...
       'sachet_3g_7', 'sachet_3g_8', 'fb_user_6', 'fb_user_7', 'fb_user_8',
       'aon', 'aug_vbc_3g', 'jul_vbc_3g', 'jun_vbc_3g', 'sep_vbc_3g'],
      dtype='object', length=149)

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.compose import ColumnTransformer

In [30]:
# This is the method to transform numerical and categorical data
categorical_transformer = Pipeline(steps = [('onehot',OneHotEncoder(handle_unknown='ignore'))])
# The below needs to be parametrised
numeric_transformer = Pipeline(steps=[('scaler',StandardScaler())])
numeric_features = X.select_dtypes(include=['int64','float64']).columns
categorical_features = X.select_dtypes(include=['category']).columns
preprocessor = ColumnTransformer(transformers=[('numeric',numeric_transformer,numeric_features),('Categorical',categorical_transformer,categorical_features)])
X_tran = pd.DataFrame(preprocessor.fit_transform(X))

In [31]:
X_tran.shape

(99999, 149)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X_tran, y, test_size = 0.25, random_state = 4, stratify = y)
X_train.shape
y_train.shape
X_test.shape
y_test.shape

(74999, 149)

(74999,)

(25000, 149)

(25000,)

In [42]:
# Taking only few examples of churn
validIdxs = np.where(y_train == 0)[0]
anomalyIdxs = np.where(y_train == 1)[0]

len(validIdxs)
len(anomalyIdxs)


67356

7643

In [45]:
len(validIdxs)
len(anomalyIdxs)

67356

673

In [46]:
random.shuffle(validIdxs)
random.shuffle(anomalyIdxs)
# compute the total number of anomaly data points to select
i = int(len(validIdxs) * 0.01)
anomalyIdxs = anomalyIdxs[:i]
# use NumPy array indexing to extract both the valid images and
# "anomlay" images
validData = X_train.iloc[validIdxs]
anomalyData = X_train.iloc[anomalyIdxs]

In [48]:
validData.shape
anomalyData.shape

(67356, 149)

(673, 149)

In [50]:
X = np.vstack([validData, anomalyData])
np.random.seed(123)
np.random.shuffle(X)
X.shape

(68029, 149)

In [64]:
X_test.shape
X_new = np.expand_dims(X, axis=1)
X_new.shape

(68029, 1, 149)

In [65]:
XTrain, XTest = train_test_split(X_new,test_size = 0.25, random_state = 4)
XTrain.shape
XTest.shape

(51021, 1, 149)

(17008, 1, 149)

#### Anomaly detection in the Churn data set

In [36]:
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv1D
#from tensorflow.keras.layers import Conv1DTranspose
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Reshape
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
import argparse
import random
import pickle
import cv2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.datasets import mnist
from sklearn.model_selection import train_test_split

In [66]:
inputShape = (1,149)
inputs = Input(shape=inputShape)
x = inputs
for f in (32,64):
    # apply a CONV => RELU => BN operation
    x = Conv1D(f, kernel_size=3, strides=2, padding="same")(x)
    x = LeakyReLU(alpha=0.2)(x)
    x = BatchNormalization()(x)
# flatten the network and then construct our latent vector
volumeSize = K.int_shape(x)
x = Flatten()(x)
latent = Dense(16)(x)
# build the encoder model
encoder = Model(inputs, latent, name="encoder")

In [67]:
encoder.summary()

Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 1, 149)]          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 1, 32)             14336     
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 1, 32)             0         
_________________________________________________________________
batch_normalization_v2_2 (Ba (None, 1, 32)             128       
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 1, 64)             6208      
_________________________________________________________________
leaky_re_lu_3 (LeakyReLU)    (None, 1, 64)             0         
_________________________________________________________________
batch_normalization_v2_3 (Ba (None, 1, 64)             256 

In [68]:
volumeSize

(None, 1, 64)

In [75]:
from tensorflow.keras.layers import Conv2DTranspose, Lambda


def Conv1DTranspose(input_tensor, filters, kernel_size, strides=2, padding='same'):
    """
        input_tensor: tensor, with the shape (batch_size, time_steps, dims)
        filters: int, output dimension, i.e. the output tensor will have the shape of (batch_size, time_steps, filters)
        kernel_size: int, size of the convolution kernel
        strides: int, convolution step size
        padding: 'same' | 'valid'
    """
    x = Lambda(lambda x: K.expand_dims(x, axis=2))(input_tensor)
    x = Conv2DTranspose(filters=filters, kernel_size=(kernel_size, 1), strides=(strides, 1), padding=padding)(x)
    x = Lambda(lambda x: K.squeeze(x, axis=2))(x)
    return x

In [107]:
# Building the decoder
latentInputs = Input(shape=(16,))
x = Dense(np.prod(volumeSize[1:]))(latentInputs)
x = Reshape((volumeSize[1], volumeSize[2]))(x)
for f in (64,32):
    # apply a CONV_TRANSPOSE => RELU => BN operation
    x = Dense(f)(x)
    #x = Conv1DTranspose(x, f, 3, strides=2, padding='same')
    x = LeakyReLU(alpha=0.2)(x)
    x = BatchNormalization()(x)

In [108]:
#x = Conv1DTranspose(x, 149,3,strides=1, padding="same")
x = Dense(149)(x)

In [109]:
outputs = Activation("sigmoid")(x)
# build the decoder model
decoder = Model(latentInputs, outputs, name="decoder")

In [110]:
decoder.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_19 (InputLayer)        [(None, 16)]              0         
_________________________________________________________________
dense_16 (Dense)             (None, 64)                1088      
_________________________________________________________________
reshape_12 (Reshape)         (None, 1, 64)             0         
_________________________________________________________________
dense_17 (Dense)             (None, 1, 64)             4160      
_________________________________________________________________
leaky_re_lu_22 (LeakyReLU)   (None, 1, 64)             0         
_________________________________________________________________
batch_normalization_v2_22 (B (None, 1, 64)             256       
_________________________________________________________________
dense_18 (Dense)             (None, 1, 32)             2080

In [111]:
autoencoder = Model(inputs, decoder(encoder(inputs)),name="autoencoder")

In [112]:
autoencoder.summary()

Model: "autoencoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 1, 149)]          0         
_________________________________________________________________
encoder (Model)              (None, 16)                21968     
_________________________________________________________________
decoder (Model)              (None, 1, 149)            12629     
Total params: 34,597
Trainable params: 34,213
Non-trainable params: 384
_________________________________________________________________


In [114]:
EPOCHS = 20
INIT_LR = 1e-3
BS = 32
opt = Adam(lr=INIT_LR, decay=INIT_LR / EPOCHS)
autoencoder.compile(loss="mse", optimizer=opt)

In [115]:
# train the convolutional autoencoder
H = autoencoder.fit(
	XTrain, XTrain,
	validation_data=(XTest, XTest),
	epochs=EPOCHS,
	batch_size=BS)

Train on 51021 samples, validate on 17008 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [116]:
X_test.shape
X_test_new = np.expand_dims(X_test, axis=1)
X_test_new.shape

(25000, 149)

(25000, 1, 149)

In [117]:
print("[INFO] making predictions...")
decoded = autoencoder.predict(X_test_new)

[INFO] making predictions...


In [119]:
decoded.shape

(25000, 1, 149)

Finding the difference between the real images and the reconstructed images

In [120]:
errors = []
# loop over all original images and their corresponding
# reconstructions
for (image, recon) in zip(X_test_new, decoded):
	# compute the mean squared error between the ground-truth image
	# and the reconstructed image, then add it to our list of errors
	mse = np.mean((image - recon) ** 2)
	errors.append(mse)

In [167]:
# compute the q-th quantile of the errors which serves as our
# threshold to identify anomalies -- any data point that our model
# reconstructed with > threshold error will be marked as an outlier
thresh = np.quantile(errors, 0.9999)
idxs = np.where(np.array(errors) >= thresh)[0]
print("[INFO] mse threshold: {}".format(thresh))
print("[INFO] {} outliers found".format(len(idxs)))

[INFO] mse threshold: 127.67552603798416
[INFO] 3 outliers found


In [168]:
idxs

array([  105,   781, 24780], dtype=int64)

In [147]:
y_new = pd.DataFrame(y_test)
y = y_new.reset_index(inplace=True)
y

In [169]:
[y_new.iloc[idxs]['churn']==1]

[105      False
 781      False
 24780    False
 Name: churn, dtype: bool]