In [1]:
import warnings
# NOTE(review): `include` is not referenced in any visible cell — this looks like an
# accidental IDE auto-import; confirm before removing.
from xml.etree.ElementInclude import include
# Suppress every warning for the rest of the session so cell outputs stay uncluttered.
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import pandas as pd
from functools import reduce
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from statistics import mean
import numpy as np
import statsmodels.api as sm


import matplotlib
# Plot styling: ggplot theme for every figure.
plt.style.use('ggplot')

# Default figure size (width, height) for every figure.
matplotlib.rcParams['figure.figsize'] = (12,8)

# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None

import seaborn as sns



In the current dataset you’re expected to track the user’s engagement using the following engagement metrics:
- session frequency
- session duration
- total session traffic (download and upload, in bytes)
How do we solve this problem?
- identify each user by their `MSISDN/Number`,
- group the users by the data they used on each app,
- determine which parameters best portray engagement.


In [2]:
# Load the telecom dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/telcom.csv")

In [55]:
# Engagement frame with one row per record of `df`.
# NOTE(review): `sessions_frequency` is a straight copy of `Bearer Id`; the values shown
# below (~1.3e19) look like identifiers rather than session counts — confirm that this is
# the intended "frequency" metric.
app_df = pd.DataFrame({
    'customer': df['MSISDN/Number'],
    'sessions_frequency': df['Bearer Id'],
    'duration': df['Dur. (ms)'],
})

# (target column, first source column, second source column): each target is the sum of
# the two source byte counters. Order here is the column order of the resulting frame.
traffic_columns = (
    ('social_media_data', 'Social Media DL (Bytes)', 'Social Media UL (Bytes)'),
    ('google_data', 'Google DL (Bytes)', 'Google UL (Bytes)'),
    ('email_data', 'Email DL (Bytes)', 'Email UL (Bytes)'),
    ('youtube_data', 'Youtube DL (Bytes)', 'Youtube UL (Bytes)'),
    ('netflix_data', 'Netflix DL (Bytes)', 'Netflix UL (Bytes)'),
    ('gaming_data', 'Gaming DL (Bytes)', 'Gaming UL (Bytes)'),
    ('other_data', 'Other DL (Bytes)', 'Other UL (Bytes)'),
    ('total_data', 'Total UL (Bytes)', 'Total DL (Bytes)'),
)
for target_column, first_source, second_source in traffic_columns:
    app_df[target_column] = df[first_source] + df[second_source]

In [15]:
# Preview the first rows of the engagement frame.
app_df.head()

Unnamed: 0,customer,sessions_frequency,duration,social_media_data,google_data,email_data,youtube_data,netflix_data,gaming_data,other_data,total_data
0,33664960000.0,1.31145e+19,1823652.0,1570185.0,2905912.0,3701304.0,18355943.0,17855187.0,292426453.0,180558843.0,345629377.0
1,33681850000.0,1.31145e+19,1365104.0,1933278.0,4414096.0,937385.0,39359124.0,35565545.0,609920783.0,541959383.0,707185356.0
2,33760630000.0,1.31145e+19,1361762.0,1726277.0,10229119.0,3363124.0,34425237.0,23751202.0,229980251.0,414908351.0,307690973.0
3,33750340000.0,1.31145e+19,1321509.0,657493.0,11811761.0,2070983.0,36534765.0,15092588.0,810387875.0,761837216.0,889352748.0
4,33699800000.0,1.31145e+19,1089009.0,912788.0,7748843.0,2110349.0,34222253.0,17539799.0,531237049.0,564619822.0,607681403.0


In [20]:
def aggregation_cols(df,col_1,col_2,trim=False):
    """Summarise ``col_2`` per ``col_1`` group with its min, max and mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    col_1 : str
        Column to group by.
    col_2 : str
        Column to aggregate.
    trim : bool, default False
        When True, return ``describe()`` of the per-group table instead of
        the table itself.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``col_1`` with columns ``<col_2>_min``, ``<col_2>_max`` and
        ``<col_2>_mean``; or the ``describe()`` summary of that table when
        ``trim`` is True.
    """
    # Aggregators are given by name so all three use pandas' own reductions:
    # they skip NaN consistently (statistics.mean returned NaN for any group
    # containing one) and avoid the deprecated "builtin callable" path.
    grouped = df.groupby(col_1).agg({col_2: ["min", "max", "mean"]})
    # Flatten the (column, aggregator) MultiIndex into "<column>_<aggregator>".
    grouped.columns = ["_".join(pair) for pair in grouped.columns.to_flat_index()]
    if trim:
        return grouped.describe()
    return grouped

In [37]:
# Per-customer min / max / mean of session duration, ranked by each customer's longest session.
duration_aggregation = aggregation_cols(app_df, col_1='customer', col_2='duration')
top_customers_duration = duration_aggregation.sort_values('duration_max', ascending=False)
top_customers_duration.head(n=10)

Unnamed: 0_level_0,duration_min,duration_max,duration_mean
customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33625780000.0,216668.0,1859336.0,1091397.0
33664960000.0,259557.0,1823652.0,1041604.0
33763550000.0,86399.0,1679813.0,871898.8
33669260000.0,1573420.0,1573420.0,1573420.0
33661720000.0,1480875.0,1480875.0,1480875.0
33607500000.0,1474663.0,1474663.0,1474663.0
33669150000.0,226538.0,1468680.0,765986.7
33662840000.0,146548.0,1392232.0,734918.9
33698700000.0,1036806.0,1382408.0,1209607.0
33762600000.0,1375612.0,1375612.0,1375612.0


In [38]:
# Per-customer min / max / mean of `sessions_frequency`, ranked by the per-customer maximum.
# NOTE(review): `sessions_frequency` is populated from `Bearer Id`, so this ranking orders
# customers by that raw value — confirm it is the intended engagement measure.
sessions_aggregation = aggregation_cols(app_df, col_1='customer', col_2='sessions_frequency')
top_customers_session = sessions_aggregation.sort_values('sessions_frequency_max', ascending=False)
top_customers_session.head(n=10)

Unnamed: 0_level_0,sessions_frequency_min,sessions_frequency_max,sessions_frequency_mean
customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33665390000.0,1.31865e+19,1.31865e+19,1.31865e+19
33616110000.0,1.31145e+19,1.31865e+19,1.31505e+19
33666830000.0,1.31865e+19,1.31865e+19,1.31865e+19
33668600000.0,1.30424e+19,1.31865e+19,1.311448e+19
33632770000.0,1.31865e+19,1.31865e+19,1.31865e+19
33659410000.0,1.31865e+19,1.31865e+19,1.31865e+19
33666030000.0,1.31865e+19,1.31865e+19,1.31865e+19
33658690000.0,1.31865e+19,1.31865e+19,1.31865e+19
33666710000.0,1.31865e+19,1.31865e+19,1.31865e+19
33609660000.0,1.30424e+19,1.31865e+19,1.311445e+19


In [39]:
# Per-customer min / max / mean of total traffic, ranked by each customer's largest session.
traffic_aggregation = aggregation_cols(app_df, col_1='customer', col_2='total_data')
top_customers_traffic = traffic_aggregation.sort_values('total_data_max', ascending=False)
top_customers_traffic.head(n=10)

Unnamed: 0_level_0,total_data_min,total_data_max,total_data_mean
customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33674920000.0,917345508.0,955984776.0,936665142.0
33664690000.0,129426890.0,952641334.0,530740848.0
33668550000.0,461875236.0,952307464.0,759167871.5
33658810000.0,292423647.0,951775502.0,622952450.0
33667830000.0,950760661.0,950760661.0,950760661.0
33610830000.0,493831975.0,949987157.0,721909566.0
33606670000.0,330304121.0,949598255.0,639951188.0
33662610000.0,949292055.0,949292055.0,949292055.0
33662050000.0,949172538.0,949172538.0,949172538.0
33668530000.0,573010442.0,948809765.0,760910103.5


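Concerning the duration, the top 10 customers' longest sessions range from 1,859,336 down to 1,375,612 in the units of `Dur. (ms)` — roughly 1,860 to 1,380 seconds if those values are milliseconds.
Concerning the sessions, the maximum stays constant at 1.31865e+19 for all of the top 10 customers; note that this column is populated from `Bearer Id`, so these values look like identifiers rather than session counts.
Concerning the data, the top 10 customers' largest sessions range from 955,984,776 down to 948,809,765 bytes.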

#### Task 2: Normalize each engagement metric and run k-means (k=3) to classify customers into three engagement groups
We can do this in three steps:
- generate pipelines
- transform the data
- run k-means

In [41]:
def generate_pipeline(type_="numeric",x=1):
    """Build a preprocessing pipeline for the requested kind of feature.

    Parameters
    ----------
    type_ : str, default "numeric"
        ``"numeric"``: mean imputation followed by min-max scaling.
        ``"categorical"``: most-frequent imputation followed by dense
        one-hot encoding that ignores unseen categories.
    x : int, default 1
        Only used for any other ``type_``: length of the zero array returned.

    Returns
    -------
    sklearn.pipeline.Pipeline or numpy.ndarray
        The pipeline for a recognised ``type_``; otherwise ``np.zeros(x)``
        (kept as-is for backward compatibility).
    """
    if type_ == "numeric":
        return Pipeline(steps=[
            ('impute', SimpleImputer(strategy='mean')),
            ('scale', MinMaxScaler())
        ])
    if type_ == "categorical":
        # scikit-learn renamed `sparse` to `sparse_output` (1.2) and later removed
        # the old name; try the new keyword first and fall back for older releases.
        try:
            encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
        return Pipeline(steps=[
            ('impute', SimpleImputer(strategy='most_frequent')),
            ('one-hot', encoder)
        ])
    # Unrecognised type: original behaviour is to hand back a zero vector.
    return np.zeros(x)

In [42]:
def store_features(df,type_,value):
    """List the column names of ``df`` selected by dtype.

    ``"numeric"`` keeps the columns whose dtype matches ``value``;
    ``"categorical"`` keeps the columns whose dtype does not match it.
    Any other ``type_`` yields ``[None]``.
    """
    if type_ == "numeric":
        selected = df.select_dtypes(include=value)
    elif type_ == "categorical":
        selected = df.select_dtypes(exclude=value)
    else:
        return [None]
    return selected.columns.tolist()

In [43]:
def generate_transformation(pipeline,df,type_,value):
    """Fit ``pipeline`` on the dtype-selected columns of ``df`` and return the result.

    ``"numeric"`` feeds the columns whose dtype matches ``value``;
    ``"categorical"`` feeds the columns whose dtype does not match it.
    Any other ``type_`` returns ``None`` without touching the pipeline.
    """
    if type_ == "numeric":
        subset = df.select_dtypes(include=value)
    elif type_ == "categorical":
        subset = df.select_dtypes(exclude=value)
    else:
        return None
    return pipeline.fit_transform(subset)

In [45]:
def frame_transforms(transform,features):
    """Wrap a transformed array in a DataFrame whose columns are ``features``."""
    framed = pd.DataFrame(data=transform, columns=features)
    return framed

In [63]:
# Impute and min-max scale every engagement column (all columns after the leading `customer`).
application_pipeline = generate_pipeline(type_="numeric")
df_to_transform = app_df[app_df.columns.to_list()[1:]]
application_transformation = generate_transformation(application_pipeline,df_to_transform,"numeric","number")
# Label the result with the columns that were actually fed to the pipeline (the numeric
# subset) instead of every column of `df_to_transform`, so the labels cannot drift out of
# step with the transformed array if a non-numeric column is ever added.
numeric_features = store_features(df_to_transform,"numeric","number")
application_transformed_df = frame_transforms(application_transformation,numeric_features)
# Re-attach the customer id; the assignment aligns on the row index of both frames.
application_transformed_df['customer'] = app_df['customer']
application_transformed_df.head()

Unnamed: 0,sessions_frequency,duration,social_media_data,google_data,email_data,youtube_data,netflix_data,gaming_data,other_data,total_data,customer
0,0.988515,0.980734,0.429842,0.185014,0.818893,0.405155,0.393722,0.340111,0.209932,0.3416,33664960000.0
1,0.988515,0.733164,0.529339,0.282389,0.206007,0.870743,0.786415,0.709765,0.630472,0.731616,33681850000.0
2,0.988515,0.73136,0.472615,0.657831,0.743904,0.761371,0.524455,0.267406,0.482631,0.300676,33760630000.0
3,0.988515,0.709627,0.179741,0.760013,0.457377,0.808134,0.332467,0.943166,0.886331,0.928123,33750340000.0
4,0.988515,0.5841,0.249699,0.497694,0.466107,0.756871,0.386729,0.618155,0.656841,0.62428,33699800000.0


In [64]:
from sklearn.cluster import KMeans

In [113]:
# Cluster on the three scaled engagement metrics only.
engagement_metrics = ['duration','sessions_frequency','total_data']
kmeans_df = application_transformed_df[engagement_metrics]

# k=3 with a fixed seed so the cluster assignment is reproducible across runs.
kmeans = KMeans(
    n_clusters=3,
    init="random",
    n_init=10,
    max_iter=300,
    random_state=42,
)
# One cluster label (0, 1 or 2) per row of `kmeans_df`.
y_pred = kmeans.fit_predict(kmeans_df)

In [114]:
# first group: rows assigned to cluster 0.
# Uses `y_pred` (the labels returned by `fit_predict`); the previous `label` name is not
# defined by any cell, so it failed on a fresh kernel.
kmeans_df[y_pred==0]

Unnamed: 0,duration,sessions_frequency,total_data
10,0.454323,0.057472,0.671577
22,0.386877,0.068965,0.640209
32,0.320230,0.068965,0.938096
34,0.316284,0.068965,0.955345
49,0.264345,0.068965,0.529314
...,...,...,...
149981,0.041411,0.068965,0.707122
149982,0.042790,0.000000,0.709142
149983,0.050380,0.068965,0.807103
149996,0.040000,0.057472,0.650301


In [115]:
# second group: rows assigned to cluster 1.
# Uses `y_pred` (the labels returned by `fit_predict`); the previous `label` name is not
# defined by any cell, so it failed on a fresh kernel.
kmeans_df[y_pred==1]

Unnamed: 0,duration,sessions_frequency,total_data
0,0.980734,0.988515,0.341600
1,0.733164,0.988515,0.731616
2,0.731360,0.988515,0.300676
3,0.709627,0.988515,0.928123
4,0.584100,0.988515,0.624280
...,...,...,...
149994,0.057447,0.988515,0.963528
149995,0.028315,0.977014,0.910226
149998,0.049189,0.988515,0.645873
149999,0.049006,0.988515,0.401893


In [116]:
# third group: rows assigned to cluster 2.
# Uses `y_pred` (the labels returned by `fit_predict`); the previous `label` name is not
# defined by any cell, so it failed on a fresh kernel.
kmeans_df[y_pred==2]

Unnamed: 0,duration,sessions_frequency,total_data
15,0.417521,0.057472,0.077766
16,0.410637,0.068965,0.184069
28,0.371562,0.068965,0.137753
31,0.331920,0.068965,0.144132
37,0.321158,0.068965,0.196668
...,...,...,...
149976,0.040349,0.068965,0.117281
149984,0.045726,0.068965,0.323904
149988,0.047949,0.068965,0.136486
149989,0.042791,0.068965,0.362768
