## User Experience Analytics

### Import Liberaries

In [1]:
import pickle
import numpy as np
import pandas as pd
from math import floor
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import zscore
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, normalize
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import sys, os

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option("expand_frame_repr", False)
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from data_visualizer import *
from data_selector import *
from outlier_handler import OutlierHandler

### Loading Data

In [4]:
df1 = pd.read_csv("../data/my_clean_data.csv")
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146887 entries, 0 to 146886
Data columns (total 55 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   Unnamed: 0                        146887 non-null  int64  
 1   Bearer Id                         146887 non-null  int64  
 2   Start                             146887 non-null  object 
 3   Start ms                          146887 non-null  float64
 4   End                               146887 non-null  object 
 5   End ms                            146887 non-null  float64
 6   IMSI                              146887 non-null  int64  
 7   MSISDN/Number                     146887 non-null  int64  
 8   IMEI                              146887 non-null  int64  
 9   Last Location Name                146887 non-null  object 
 10  Avg RTT DL (ms)                   146887 non-null  float64
 11  Avg RTT UL (ms)                   146887 non-null  f

1. Aggregate, per customer, the following information (treat missing & outliers by replacing by the mean or the mode of the corresponding variable):
-	Average TCP retransmission
-	Average RTT
-	Handset type
-	Average throughput


In [5]:
user_experience = df1[[
    "MSISDN/Number",
    "Avg RTT DL (ms)",
    "Avg RTT UL (ms)",
    "Avg Bearer TP DL (kbps)",
    "Avg Bearer TP UL (kbps)",
    "TCP DL Retrans. Vol (Bytes)",
    "TCP UL Retrans. Vol (Bytes)",
    "Handset Type"]].copy()

In [6]:
user_experience['total_avg_rtt'] = user_experience['Avg RTT DL (ms)'] + user_experience['Avg RTT UL (ms)']
user_experience['total_avg_tp'] = user_experience['Avg Bearer TP DL (kbps)'] + user_experience['Avg Bearer TP UL (kbps)']
user_experience['total_avg_tcp'] = user_experience['TCP DL Retrans. Vol (Bytes)'] + user_experience['TCP UL Retrans. Vol (Bytes)']
user_experience.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146887 entries, 0 to 146886
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   MSISDN/Number                146887 non-null  int64  
 1   Avg RTT DL (ms)              146887 non-null  float64
 2   Avg RTT UL (ms)              146887 non-null  float64
 3   Avg Bearer TP DL (kbps)      146887 non-null  float64
 4   Avg Bearer TP UL (kbps)      146887 non-null  float64
 5   TCP DL Retrans. Vol (Bytes)  146887 non-null  float64
 6   TCP UL Retrans. Vol (Bytes)  146887 non-null  float64
 7   Handset Type                 146887 non-null  object 
 8   total_avg_rtt                146887 non-null  float64
 9   total_avg_tp                 146887 non-null  float64
 10  total_avg_tcp                146887 non-null  float64
dtypes: float64(9), int64(1), object(1)
memory usage: 12.3+ MB


In [7]:
user_experience.head()

Unnamed: 0,MSISDN/Number,Avg RTT DL (ms),Avg RTT UL (ms),Avg Bearer TP DL (kbps),Avg Bearer TP UL (kbps),TCP DL Retrans. Vol (Bytes),TCP UL Retrans. Vol (Bytes),Handset Type,total_avg_rtt,total_avg_tp,total_avg_tcp
0,33664962239,42.0,5.0,23.0,44.0,19520.0,7230.0,Samsung Galaxy A5 Sm-A520F,47.0,67.0,26750.0
1,33681854413,65.0,5.0,16.0,26.0,19520.0,7230.0,Samsung Galaxy J5 (Sm-J530),70.0,42.0,26750.0
2,33760627129,65.0,5.0,6.0,9.0,19520.0,7230.0,Samsung Galaxy A8 (2018),70.0,15.0,26750.0
3,33750343200,65.0,5.0,44.0,44.0,19520.0,7230.0,undefined,70.0,88.0,26750.0
4,33699795932,65.0,5.0,6.0,9.0,19520.0,7230.0,Samsung Sm-G390F,70.0,15.0,26750.0


In [8]:
_user_experience = user_experience.groupby('MSISDN/Number').agg({
    'total_avg_rtt': 'sum',
    'total_avg_tp': 'sum',
    'total_avg_tcp': 'sum',
    'Handset Type': [lambda x: x.mode()[0]]})

user_experience = pd.DataFrame(columns=[
    "total_avg_rtt",
    "total_avg_tp",
    "total_avg_tcp",
    "Handset Type"])

user_experience["total_avg_rtt"] = _user_experience["total_avg_rtt"]['sum']
user_experience["total_avg_tp"] = _user_experience["total_avg_tp"]['sum']
user_experience["total_avg_tcp"] = _user_experience["total_avg_tcp"]['sum']
user_experience["Handset Type"] = _user_experience["Handset Type"]['<lambda>']
user_experience.head()

Unnamed: 0_level_0,total_avg_rtt,total_avg_tp,total_avg_tcp,Handset Type
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
33601001722,46.0,76.0,2895381.0,Huawei P20 Lite Huawei Nova 3E
33601001754,31.0,99.0,7685845.5,Apple iPhone 7 (A1778)
33601002511,59.0,97.0,4150403.0,undefined
33601007832,84.0,248.0,2396.0,Apple iPhone 5S (A1457)
33601008617,119.0,43204.5,8047274.5,Apple iPhone Se (A1723)


2. Compute & list 10 of the top, bottom and most frequent:
- TCP values in the dataset. 
- RTT values in the dataset.
- Throughput values in the dataset


In [9]:
#TCP values in the dataset.
sorted_by_tcp = user_experience.sort_values(
    'total_avg_tcp', ascending=False)
top_10 = sorted_by_tcp.head(10)['total_avg_tcp']
last_10 = sorted_by_tcp.tail(10)['total_avg_tcp']
most_10 = user_experience['total_avg_tcp'].value_counts().head(10)


In [10]:
from IPython.display import Image
import plotly.io as pio

In [None]:
mult_hist([top_10, last_10, most_10], 1,
          3, "TCP values in the dataset", ['Top 10', 'Last 10', 'Most 10'])

In [None]:
#RTT values in the dataset. 
sorted_by_rtt = user_experience.sort_values(
    'total_avg_rtt', ascending=False)
top_10 = sorted_by_rtt.head(10)['total_avg_rtt']
last_10 = sorted_by_rtt.tail(10)['total_avg_rtt']
most_10 = user_experience['total_avg_rtt'].value_counts().head(10)

In [None]:

mult_hist([top_10, last_10, most_10, top_10], 1,
          3, "RTT values in the dataset", ['Top 10', 'Last 10', 'Most 10'])

In [None]:
#Throughput values in the dataset
sorted_by_tp = user_experience.sort_values(
    'total_avg_tp', ascending=False)
top_10 = sorted_by_tp.head(10)['total_avg_tp']
last_10 = sorted_by_tp.tail(10)['total_avg_tp']
most_10 = user_experience['total_avg_tp'].value_counts().head(10)

In [None]:
mult_hist([top_10, last_10, most_10, top_10], 1,
          3, "TCP values in the dataset", ['Top 10', 'Last 10', 'Most 10'])

3. Compute & report:
- The distribution of the average throughput  per handset type and provide interpretation for your findings.
- The average TCP retransmission view per handset type and provide interpretation for your findings.
 

In [None]:
handset_type_df = user_experience.groupby('Handset Type').agg(
    {'total_avg_tp': 'mean', 'total_avg_tcp': 'mean'})
handset_type_df.head()

In [None]:
sorted_by_tp = handset_type_df.sort_values(
    'total_avg_tp', ascending=False)
top_tp = sorted_by_tp['total_avg_tp']

In [None]:
hist(top_tp)

In [None]:
hist(top_tp.head(20))

- We can see modern phones have higher throughput.

In [None]:
#The average TCP retransmission view per handset type and provide interpretation for your findings.
sorted_by_tcp = handset_type_df.sort_values(
    'total_avg_tcp', ascending=False)
top_tcp = sorted_by_tcp['total_avg_tcp']

In [None]:
hist(top_tcp)

In [None]:
hist(top_tcp.head(20))

4. Using the experience metrics above, perform a k-means clustering (where k = 3) to segment users into groups of experiences and provide a brief description of each cluster. (The description must define each group based on your understanding of the data)

In [None]:
experiance_metric_df = user_experience[[
    "total_avg_rtt",
    "total_avg_tp",
    "total_avg_tcp"]].copy()
experiance_metric_df.head()

In [None]:
experiance_metric_df.plot.box()

In [None]:
df_outliers = OutlierHandler(experiance_metric_df)
df_outliers.getOverview(['total_avg_rtt','total_avg_tp','total_avg_tcp'])

In [None]:
df_outliers.replace_outliers_with_fences(
    ["total_avg_rtt",
     "total_avg_tp",
     "total_avg_tcp"])
#df_outliers = DfOutlier(experiance_metric_df)
df_outliers.getOverview(["total_avg_rtt",
     "total_avg_tp",
     "total_avg_tcp"])

In [None]:
experiance_metric_df.plot.box()

In [None]:
scaler = StandardScaler()
scaled_array = scaler.fit_transform(df_outliers.df)
scaled_array
pd.DataFrame(scaled_array).head(5)

In [None]:
data_normalized = normalize(scaled_array)
pd.DataFrame(data_normalized).head(5)

In [None]:
kmeans = KMeans(n_clusters=3, random_state=0).fit(data_normalized)
kmeans.labels_

In [None]:
experiance_metric_df.insert(0, 'cluster', kmeans.labels_)
experiance_metric_df

In [None]:
experiance_metric_df['cluster'].value_counts()

In [None]:
fig = px.scatter(experiance_metric_df, x='total_avg_rtt', y="total_avg_tp",
                 color='cluster', size='total_avg_tcp')
fig.show()

In [None]:
import plotly.graph_objects as go

# Assuming experiance_metric_df is your DataFrame
fig = go.Figure(data=[go.Scatter3d(x=experiance_metric_df['total_avg_tcp'],
                                    y=experiance_metric_df['total_avg_rtt'],
                                    z=experiance_metric_df['total_avg_tp'],
                                    mode='markers',
                                    marker=dict(color=experiance_metric_df['cluster']))])

fig.update_layout(scene=dict(xaxis_title='Total Average TCP',
                             yaxis_title='Total Average RTT',
                             zaxis_title='Total Average TP'),
                  width=900,  # Set the width of the plot
                  height=800,  # Set the height of the plot
                  title='3D Scatter Plot')

fig.show()
  

In [None]:
# Save the final data frame
user_experience.to_csv('../data/TellCo_user_experience_data.csv')

In [None]:
with open("../models/TellCo_user_experiance.pkl", "wb") as f:
    pickle.dump(kmeans, f)