In [1]:
import warnings
from xml.etree.ElementInclude import include
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import pandas as pd
from functools import reduce
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from statistics import mean
import numpy as np
import statsmodels.api as sm


import matplotlib
plt.style.use('ggplot')

matplotlib.rcParams['figure.figsize'] = (12,8)

pd.options.mode.chained_assignment = None

import seaborn as sns


In [2]:
df = pd.read_csv("../data/telcom.csv")

In [3]:
df.columns

Index(['Bearer Id', 'Start', 'Start ms', 'End', 'End ms', 'Dur. (ms)', 'IMSI',
       'MSISDN/Number', 'IMEI', 'Last Location Name', 'Avg RTT DL (ms)',
       'Avg RTT UL (ms)', 'Avg Bearer TP DL (kbps)', 'Avg Bearer TP UL (kbps)',
       'TCP DL Retrans. Vol (Bytes)', 'TCP UL Retrans. Vol (Bytes)',
       'DL TP < 50 Kbps (%)', '50 Kbps < DL TP < 250 Kbps (%)',
       '250 Kbps < DL TP < 1 Mbps (%)', 'DL TP > 1 Mbps (%)',
       'UL TP < 10 Kbps (%)', '10 Kbps < UL TP < 50 Kbps (%)',
       '50 Kbps < UL TP < 300 Kbps (%)', 'UL TP > 300 Kbps (%)',
       'HTTP DL (Bytes)', 'HTTP UL (Bytes)', 'Activity Duration DL (ms)',
       'Activity Duration UL (ms)', 'Dur. (ms).1', 'Handset Manufacturer',
       'Handset Type', 'Nb of sec with 125000B < Vol DL',
       'Nb of sec with 1250B < Vol UL < 6250B',
       'Nb of sec with 31250B < Vol DL < 125000B',
       'Nb of sec with 37500B < Vol UL',
       'Nb of sec with 6250B < Vol DL < 31250B',
       'Nb of sec with 6250B < Vol UL < 37500B',


In [6]:
experience_df = pd.DataFrame({'customer':df['MSISDN/Number'],
                      'sessions_frequency':df['Bearer Id'],
                      'duration':df['Dur. (ms)'],
                        'handset':df['Handset Type']})
experience_df['rtt'] = df['Avg RTT DL (ms)']+df['Avg RTT UL (ms)']
experience_df['tp'] = df['Avg Bearer TP DL (kbps)']+df['Avg Bearer TP UL (kbps)']
experience_df['tcp'] = df['TCP DL Retrans. Vol (Bytes)']+df['TCP UL Retrans. Vol (Bytes)']
experience_df.head(4)

Unnamed: 0,customer,sessions_frequency,duration,handset,rtt,tp,tcp
0,33664960000.0,1.31145e+19,1823652.0,Samsung Galaxy A5 Sm-A520F,47.0,67.0,
1,33681850000.0,1.31145e+19,1365104.0,Samsung Galaxy J5 (Sm-J530),70.0,42.0,
2,33760630000.0,1.31145e+19,1361762.0,Samsung Galaxy A8 (2018),,15.0,
3,33750340000.0,1.31145e+19,1321509.0,undefined,,88.0,


Task 3. 1 - Aggregate, per customer, the following information (treat missing & outliers by replacing by the mean or the mode of the corresponding variable):
Average TCP retransmission
Average RTT
Handset type
Average throughput


In [8]:
def generate_pipeline(type_="numeric",x=1):
    pipeline = None
    if type_ == "numeric":
        pipeline = Pipeline(steps=[
            ('impute', SimpleImputer(strategy='mean')),
            ('scale', MinMaxScaler())
        ])
    elif type_ == "categorical":
        pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
        ])
    else:
        pipeline = np.zeros(x)
    return pipeline

In [9]:
numeric_pipeline = generate_pipeline("numeric")
assert isinstance(numeric_pipeline,Pipeline)

In [10]:
categorical_pipeline = generate_pipeline("categorical")
assert isinstance(numeric_pipeline,Pipeline)

In [11]:
def store_features(df,type_,value):
    features = [None]
    if type_ == "numeric":
        features = df.select_dtypes(include=value).columns.tolist()
    elif type_ == "categorical":
        features = df.select_dtypes(exclude=value).columns.tolist()
    return features


In [12]:
categorical_features = store_features(df,"categorical","number")

In [13]:
numerical_features = store_features(df,"numeric","number")

In [14]:
def handle_missing_values_numeric(df,features):
    """
    this algorithm does the following
    - remove columns with x percentage of missing values
    - fill the missing values with the mean
    returns:
        - df
        - percentage of missing values
    """
    missing_percentage = round((df.isnull().sum().sum()/\
            reduce(lambda x, y: x*y, df.shape))*100,2)
    for key in features:
        df[key] = df[key].fillna(df[key].mean())
    return missing_percentage, df

In [15]:
def handle_missing_values_categorical(df,features):
    """
    this algorithm does the following
    - remove columns with x percentage of missing values
    - fill the missing values with the mode
    returns:
        - df
        - percentage of missing values
    """
    missing_percentage = round((df.isnull().sum().sum()/\
            reduce(lambda x, y: x*y, df.shape))*100,2)
    for key in features:
        df[key] = df[key].fillna(df[key].mode()[0])
    return missing_percentage, df

In [16]:
handle_missing_values_categorical(experience_df,categorical_features)

KeyError: 'Start'

In [7]:
def aggregation_cols(df,col_1,col_2,trim=False):
    
    grouped = df.groupby(col_1).agg({col_2: [min, max, mean]}) 
    grouped.columns = ["_".join(x) for x in grouped.columns.ravel()]
    if trim:
        return grouped.describe()
    return grouped

In [None]:
duration_aggregation = aggregation_cols(app_df,'customer','rtt')
top_customers_duration = duration_aggregation.sort_values(by='duration_max', ascending=False)
top_customers_duration.head(10)