## Telecom User Data Analysis

### Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import pandas as pd
from functools import reduce
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from statistics import mean
from pandas_profiling import ProfileReport
import numpy as np
import json
import datetime
import math
import statsmodels.api as sm

from datetime import timedelta, datetime

import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8)

pd.options.mode.chained_assignment = None

import seaborn as sns


### Data Preprocessing

In [None]:
df = pd.read_csv("../data/telcom.csv")

In [None]:
df.head()

In [None]:
### Check for sufficiency of data to answer objectives by reviewing the objective and 
### available data. And check the number of observations and required sample size.

In [None]:
# generate pipelines
def generate_pipeline(type_="numeric",x=1):
    """Build the preprocessing pipeline for one kind of feature.

    type_:
        - "numeric": mean imputation followed by min-max scaling
        - "categorical": most-frequent imputation followed by dense
          one-hot encoding (unknown categories are ignored)
        - anything else: no pipeline is built
    x: length of the zero vector returned for an unrecognised ``type_``

    returns:
        - a ``Pipeline`` for the two known types, otherwise ``np.zeros(x)``
    """
    if type_ == "numeric":
        numeric_steps = [
            ('impute', SimpleImputer(strategy='mean')),
            ('scale', MinMaxScaler()),
        ]
        return Pipeline(steps=numeric_steps)

    if type_ == "categorical":
        # NOTE(review): newer scikit-learn releases renamed `sparse` to
        # `sparse_output` -- confirm against the pinned version.
        categorical_steps = [
            ('impute', SimpleImputer(strategy='most_frequent')),
            ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
        ]
        return Pipeline(steps=categorical_steps)

    # Unknown type: callers get a placeholder array rather than a pipeline.
    return np.zeros(x)



In [None]:
# test pipeline
numeric_pipeline = generate_pipeline("numeric")
assert isinstance(numeric_pipeline,Pipeline)

In [None]:
# Build the categorical pipeline and check that *it* (not the numeric one
# created in an earlier cell) is a Pipeline instance.
categorical_pipeline = generate_pipeline("categorical")
assert isinstance(categorical_pipeline,Pipeline)

In [None]:
def store_features(df,type_,value):
    """List the column names of ``df`` selected by dtype.

    type_:
        - "numeric": columns whose dtype matches ``value``
        - "categorical": columns whose dtype does NOT match ``value``
    value: dtype selector forwarded to ``DataFrame.select_dtypes``

    returns:
        - list of column names, or ``[None]`` for any other ``type_``
    """
    if type_ == "numeric":
        selected = df.select_dtypes(include=value)
    elif type_ == "categorical":
        selected = df.select_dtypes(exclude=value)
    else:
        return [None]
    return selected.columns.tolist()


In [None]:
categorical_features = store_features(df,"categorical","number")

In [None]:
# testing function
numerical_features = store_features(df,"numeric","number")

In [None]:
len(numerical_features)

In [None]:
assert len(numerical_features)>0

In [None]:
### checking for duplicates

In [None]:
def generate_transformation(pipeline,df,type_,value):
    """Fit ``pipeline`` on a dtype-selected subset of ``df`` and transform it.

    type_:
        - "numeric": use the columns whose dtype matches ``value``
        - "categorical": use the columns whose dtype does NOT match ``value``

    returns:
        - whatever ``pipeline.fit_transform`` returns, or ``None`` for any
          other ``type_`` (the pipeline is not touched in that case)
    """
    if type_ == "numeric":
        subset = df.select_dtypes(include=value)
    elif type_ == "categorical":
        subset = df.select_dtypes(exclude=value)
    else:
        return None
    return pipeline.fit_transform(subset)

In [None]:
def frame_transforms(transform,features):
    """Wrap a transformed array in a DataFrame labelled with ``features``."""
    framed = pd.DataFrame(data=transform, columns=features)
    return framed

In [None]:
def handle_missing_values_numeric(df,features):
    """
    Fill missing values in the given numeric columns with the column mean.

    - the share of missing cells is measured over the WHOLE frame, before
      anything is filled
    - every column in ``features`` is then filled in place, so the caller's
      ``df`` is modified
    - no column is dropped

    returns (in this order):
        - percentage of missing cells, rounded to 2 decimals
          (0.0 for a frame with no cells)
        - df (the same object that was passed in)
    """
    total_cells = df.size
    if total_cells == 0:
        # An empty frame has nothing missing; avoid dividing by zero.
        missing_percentage = 0.0
    else:
        missing_percentage = round(
            (df.isnull().sum().sum() / total_cells) * 100, 2)
    for key in features:
        df[key] = df[key].fillna(df[key].mean())
    return missing_percentage, df

In [None]:
handle_missing_values_numeric(df,numerical_features)

In [None]:
def handle_missing_values_categorical(df,features):
    """
    Fill missing values in the given categorical columns with the mode.

    - the share of missing cells is measured over the WHOLE frame, before
      anything is filled
    - every column in ``features`` is then filled in place with its most
      frequent value, so the caller's ``df`` is modified
    - a column with no non-missing value has no mode and is left untouched
    - no column is dropped

    returns (in this order):
        - percentage of missing cells, rounded to 2 decimals
          (0.0 for a frame with no cells)
        - df (the same object that was passed in)
    """
    total_cells = df.size
    if total_cells == 0:
        # An empty frame has nothing missing; avoid dividing by zero.
        missing_percentage = 0.0
    else:
        missing_percentage = round(
            (df.isnull().sum().sum() / total_cells) * 100, 2)
    for key in features:
        modes = df[key].mode()
        if modes.empty:
            # All values are missing: indexing the mode would raise.
            continue
        df[key] = df[key].fillna(modes.iloc[0])
    return missing_percentage, df

In [None]:
handle_missing_values_categorical(df,categorical_features)

In [None]:
# test transformations
numeric_transformation = generate_transformation(numeric_pipeline,df,"numeric","number")

In [None]:
numeric_df = frame_transforms(numeric_transformation,numerical_features)

In [None]:
def split_data(df,response_variable,split_ratio,get):
    """Split ``df`` into train/test parts and return one of the four pieces.

    response_variable: column used as the target; all other columns are
        the predictors
    split_ratio: forwarded to ``train_test_split`` as ``test_size``
    get: "X_train", "X_test" or "y_train"; any other value yields ``y_test``

    The split uses a fixed ``random_state`` so repeated calls with the same
    inputs return pieces of the same split.
    """
    predictors = df.drop(response_variable, axis=1)
    target = df[response_variable]
    pieces = train_test_split(predictors, target, test_size=split_ratio,
                              random_state=1121218)
    # train_test_split returns the pieces in exactly this order.
    for label, piece in zip(("X_train", "X_test", "y_train"), pieces):
        if get == label:
            return piece
    return pieces[-1]
    
    

In [None]:
numeric_transformation_ = generate_transformation(numeric_pipeline,
                                                 pd.DataFrame(split_data(df,"Total DL (Bytes)",0.3,"X_train")),
                                                 "numeric","number")

In [None]:
### Check for column organization and naming and correct them.

In [None]:
### Check for missing values and either fill them with a value that will, in no way, 
### affect the analysis or remove the record.

In [None]:
def top_x_column(df, x, column,color,online=False):
    """Plot and return the ``x`` most frequent values of ``df[column]``.

    A bar chart of the ``x`` highest counts is drawn on a new figure.

    color: bar colour passed to the plot
    online: when truthy, the figure is returned together with the values

    returns:
        - list of the ``x`` most frequent values (most frequent first), or
        - ``(fig, list)`` when ``online`` is truthy
    """
    # Count through a scratch frame so the counted series is named 'type'.
    scratch = pd.DataFrame(columns = [column])
    scratch['type'] = df[column].to_list()
    counts = scratch['type'].value_counts()

    fig, ax = plt.subplots()
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=10)
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency")
    ax.set_title(f"The {x} Most Frequent {column}")
    counts[:x].plot(ax=ax, kind='bar', color=color)

    leaders = list(counts.to_dict())[:x]
    return (fig, leaders) if online else leaders
    

In [None]:
# Identify the top 10 handset manufacturers used by the customers.
values = top_x_column(df,10,"Handset Manufacturer","purple")

In [None]:
values = top_x_column(df,3,"Handset Type","green")

In [None]:
values

In [None]:
# Identify the top 5 handsets per top 3 handset manufacturer
def top_x_by_y_cols(df,col_1,col_2,x,y):
    """For each of the ``x`` most frequent ``col_1`` values, rank ``col_2``.

    Example: the top ``y`` handset types for each of the top ``x`` handset
    manufacturers.

    The ``x`` leading ``col_1`` values come from ``top_x_column`` (which
    also draws its bar chart). For each of them the rows of that group are
    counted by ``col_2`` and the ``y`` most frequent values are kept.

    returns:
        - list of Series, one per leading ``col_1`` value, in sorted
          ``col_1`` order; each Series is indexed by the ``col_2`` value and
          holds its row count, most frequent first
    """
    leaders = top_x_column(df, x, col_1, "purple")
    result = []
    for group_value, frame in df.groupby(col_1, sort=True):
        if group_value in leaders:
            # Rank by frequency and honour ``y`` (was an alphabetical sort
            # of the rows truncated to a hard-coded 5).
            result.append(frame[col_2].value_counts().head(y))
    return result
    

In [None]:
top_x_by_y_cols(df,'Handset Manufacturer','Handset Type',3,5)

In [None]:
# Aggregate the following information per user:
# number of xDR sessions
# Session duration
# the total download (DL) and upload (UL) data
# the total data volume (in Bytes) during this session for each application


def aggregation_cols(df,col_1,col_2,trim=False):
    """Aggregate ``col_2`` per ``col_1`` group into min, max and mean.

    col_1: column to group by
    col_2: column to aggregate (its name is used as the column prefix)
    trim: when truthy, return ``describe()`` of the aggregated table
        instead of the table itself

    returns:
        - DataFrame indexed by ``col_1`` with the columns
          ``<col_2>_min``, ``<col_2>_max`` and ``<col_2>_mean``
          (or its ``describe()`` summary when ``trim`` is truthy)
    """
    # String names select pandas' own aggregations, so this does not depend
    # on which ``mean`` happens to be imported at file level.
    grouped = df.groupby(col_1).agg({col_2: ["min", "max", "mean"]})
    # Flatten the (column, statistic) pairs into single-level names.
    grouped.columns = ["_".join(parts) for parts in grouped.columns]
    if trim:
        return grouped.describe()
    return grouped

In [None]:
aggregation_cols(df,'MSISDN/Number','Bearer Id',True)

In [None]:
aggregation_cols(df,'MSISDN/Number',"Dur. (ms)",True)

In [None]:
aggregation_cols(df,'MSISDN/Number','Total UL (Bytes)',True)

In [None]:
aggregation_cols(df,'MSISDN/Number','Total DL (Bytes)',True)

## Exploratory Data Analysis (EDA)

In [None]:
#Describe all  relevant variables and associated data types
# Analyze the basic metrics (mean, median, etc) in the Dataset (explain) & 
# their importance for the global objective.
# Conduct a Non-Graphical Univariate Analysis by computing dispersion parameters 
# for each quantitative variable and provide useful interpretation.

def non_graphical_analysis(df,features,type_,opt,x_=1,y_=1):
    """Compute a tabular (non-graphical) summary of selected features.

    features: list of column names; ``x_`` and ``y_`` are positions in it
    type_:
        - "univariate": ``describe()`` of the feature at position ``x_``
        - "bivariate": relation between features ``x_`` and ``y_``, chosen
          by ``opt``: "regression" (OLS of feature ``y_`` on feature ``x_``
          plus a constant, returned as the model summary) or "corr"
          (2x2 correlation table)
        - "multivariate": correlation table of all ``features``
    opt: only read for "bivariate"

    returns:
        - the summary described above, or ``None`` when ``type_``/``opt`` is
          not recognised or ``x_`` matches no position in ``features``
    """
    if type_ == "multivariate":
        return pd.DataFrame(df[features].corr())

    if type_ not in ("univariate", "bivariate"):
        return None

    # Look the feature up by position; a position that matches nothing
    # (negative or past the end) yields no result instead of an error.
    not_found = object()
    key = next((name for pos, name in enumerate(features) if pos == x_),
               not_found)
    if key is not_found:
        return None

    if type_ == "univariate":
        return pd.DataFrame(df[key].describe())

    if opt == "regression":
        response = df[features[y_]]
        design = sm.add_constant(df[[key]])
        return sm.OLS(response, design).fit().summary()
    if opt == "corr":
        return pd.DataFrame(df[[key, features[y_]]].corr())
    return None
        

In [None]:
non_graphical_analysis(numeric_df,numerical_features,"univariate",1)

In [None]:
# The 4th positional argument is `opt`; the feature position must go in `x_`.
non_graphical_analysis(numeric_df,numerical_features,"univariate",None,x_=2)

In [None]:
# The 4th positional argument is `opt`; the feature position must go in `x_`.
non_graphical_analysis(numeric_df,numerical_features,"univariate",None,x_=3)

In [None]:
non_graphical_analysis(numeric_df,numerical_features,"bivariate","regression",36,49)

In [None]:
non_graphical_analysis(numeric_df,numerical_features,"bivariate","corr",1,4)

In [None]:
# Conduct a Graphical Univariate Analysis by identifying the most suitable plotting options 
# for each variable and interpret your findings.

def graphical_analysis(df,features,type_,opt,x=1,y=1):
    """Draw a plot for one feature or a pair of features.

    features: list of column names; ``x`` and ``y`` are positions in it
    type_ / opt:
        - "univariate" with "box", "hist" or "curve" (kernel density):
          plot the feature at position ``x``
        - "bivariate" with "scatter": scatter feature ``x`` against
          feature ``y`` on the current matplotlib figure, with a title
          and axis labels

    returns:
        - the object returned by the plotting call for "univariate" plots
        - ``None`` for the scatter plot, for an unrecognised
          ``type_``/``opt``, or when ``x`` matches no position in
          ``features``
    """
    if type_ not in ("univariate", "bivariate"):
        return None

    # Look the feature up by position; nothing is drawn when the position
    # matches no entry.
    not_found = object()
    key = next((name for pos, name in enumerate(features) if pos == x),
               not_found)
    if key is not_found:
        return None

    if type_ == "univariate":
        if opt == 'box':
            return df.boxplot(column=[key], grid=False, color='black')
        if opt == 'hist':
            return df.hist(column=[key], grid=False, edgecolor='black')
        if opt == 'curve':
            return sns.kdeplot(df[key])
        return None

    if opt == "scatter":
        x_values = df[features[x]]
        y_values = df[features[y]]
        plt.scatter(x_values, y_values)
        plt.title(f'{features[x]} vs {features[y]}')
        plt.xlabel(f'{features[x]}')
        plt.ylabel(f'{features[y]}')
    return None
                
    

In [None]:
graphical_analysis(numeric_df,numerical_features,"univariate","hist",x=1)

In [None]:
graphical_analysis(numeric_df,numerical_features,"univariate","hist",x=2)

In [None]:
graphical_analysis(numeric_df,numerical_features,"univariate","hist",x=3)

In [None]:
graphical_analysis(numeric_df,numerical_features,"univariate","box",1)

In [None]:
graphical_analysis(numeric_df,numerical_features,"univariate","box",x=2)

In [None]:
graphical_analysis(numeric_df,numerical_features,"univariate","box",x=3)

In [None]:
graphical_analysis(numeric_df,numerical_features,"univariate","box",x=4)

In [None]:
graphical_analysis(numeric_df,numerical_features,"univariate","curve",x=1)

In [None]:
graphical_analysis(numeric_df,numerical_features,"univariate","curve",x=2)

In [None]:
graphical_analysis(numeric_df,numerical_features,"univariate","curve",x=3)

In [None]:
graphical_analysis(numeric_df,numerical_features,"univariate","curve",x=4)

In [None]:
# Bivariate Analysis – explore the relationship between each application & 
# the total DL+UL data using appropriate methods and interpret your findings. 
graphical_analysis(numeric_df,numerical_features,"bivariate","scatter",x=34,y=49)

In [None]:
graphical_analysis(numeric_df,numerical_features,"bivariate","scatter",x=36,y=49)

In [None]:
graphical_analysis(numeric_df,numerical_features,"bivariate","scatter",x=37,y=49)

In [None]:
# Correlation Analysis – compute a correlation matrix for the following variables and interpret your 
# findings: Social Media data, Google data, Email data, Youtube data, Netflix data, Gaming data, Other data 
non_graphical_analysis(numeric_df,numerical_features,"multivariate",1,4)

In [None]:
# pca analysis
def setup_pca(data,n):
    """Fit a PCA with ``n`` components on ``data``.

    returns:
        - the data projected onto the components
        - the fitted PCA object
    """
    model = PCA(n_components=n)
    projected = model.fit_transform(data)
    return projected, model

In [None]:
def pca_analysis(df,features,no,x_,component):
    """Rank the predictors by their absolute loading on one PCA component.

    The feature at position ``x_`` of ``features`` is treated as the
    response and dropped; the remaining numeric columns of the training
    split are run through ``numeric_pipeline`` and a PCA with ``no``
    components is fitted on the result.

    component: row of ``components_`` whose loadings are reported

    returns:
        - DataFrame with the columns 'PCA' (absolute loading) and
          'Variable Names', sorted by 'PCA' in descending order
        - ``None`` when ``x_`` matches no position in ``features``
    """
    for position, key in enumerate(features):
        if position != x_:
            continue
        predictors = pd.DataFrame(split_data(df, key, 0.3, "X_train"))
        # Label loadings with the columns that are actually transformed
        # (same dtype selection as generate_transformation), not with a
        # prefix of a global feature list.
        variable_names = predictors.select_dtypes(
            include="number").columns.tolist()
        train = generate_transformation(numeric_pipeline, predictors,
                                        "numeric", "number")
        _, pca_train = setup_pca(train, no)
        # Series-wise construction tolerates a length mismatch (padded with
        # NaN) should the pipeline ever return fewer columns than it was given.
        loadings = pd.DataFrame({
            'PCA': pd.Series(np.absolute(pca_train.components_[component])),
            'Variable Names': pd.Series(variable_names),
        })
        return loadings.sort_values('PCA', ascending=False)
    return None

In [None]:
pca_analysis(numeric_df,numerical_features,10,49,1)

In [None]:
pca_analysis(numeric_df,numerical_features,10,48,1)

In [None]:
pca_analysis(numeric_df,numerical_features,10,47,1)

In [None]:
pca_analysis(numeric_df,numerical_features,10,34,1)

In [None]:
def categorize_based_on_deciles(df,features,x_):
    """Sum every column of ``df`` per decile of one feature.

    The feature at position ``x_`` of ``features`` is cut into 10
    quantile bins (ranks 0-9) and the frame is summed within each bin.
    The caller's ``df`` is not modified.

    returns:
        - DataFrame indexed by 'decile_rank' holding the per-decile sums
        - ``None`` when ``x_`` matches no position in ``features``
    """
    for position, key in enumerate(features):
        if position != x_:
            continue
        ranks = pd.qcut(df[key], 10, labels=False).rename('decile_rank')
        # Group on the rank series rather than writing it into ``df``; a
        # 'decile_rank' column left by earlier code is kept out of the sums.
        values = df.drop(columns='decile_rank', errors='ignore')
        return values.groupby(ranks).sum()
    return None

In [None]:
categorize_based_on_deciles(numeric_df,numerical_features,39)

In [None]:
categorize_based_on_deciles(numeric_df,numerical_features,49)

In [None]:
categorize_based_on_deciles(numeric_df,numerical_features,34)

In [None]:
def fixing_outliers(df, column):
    """Replace values above the 95th percentile of ``column`` with its median.

    Both the threshold and the median are taken from the column as it is
    before replacement. ``df[column]`` is overwritten in place.

    returns:
        - the updated ``df[column]``
    """
    series = df[column]
    threshold = series.quantile(0.95)
    replacement = series.median()
    df[column] = np.where(series > threshold, replacement, series)
    return df[column]


In [None]:
numeric_df["Start ms"] = fixing_outliers(numeric_df, "Start ms")