# Import Modules

In [11]:
import math
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, normalize
from sqlalchemy import create_engine

# Read CSV Data


In [12]:
# Load the per-user engagement table (one row per MSISDN) and preview it.
engagement_csv_path = "../data/telecom_user_engagement_data.csv"
telecom_engagement_df = pd.read_csv(engagement_csv_path)
telecom_engagement_df.head(n=5)

Unnamed: 0,MSISDN/Number,Cluster,number of xDR Sessions,Dur (ms),Total Data Volume (Bytes)
0,33601001722,2,1,116720140.0,878690600.0
1,33601001754,0,1,181230963.0,156859600.0
2,33601007832,0,1,49878024.0,422320700.0
3,33601008617,1,2,37104453.0,1457411000.0
4,33601010682,1,2,253983077.0,615217200.0


In [14]:
# Load the per-user experience table (RTT, throughput, TCP retransmission) and preview it.
experience_csv_path = "../data/telecom_user_experience_data.csv"
telecom_experience_df = pd.read_csv(experience_csv_path)
telecom_experience_df.head(n=5)

Unnamed: 0,MSISDN/Number,Cluster,Total Avg RTT (ms),Total Avg Bearer TP (kbps),Total TCP Retrans. Vol (Bytes)
0,33601001722,0,46.0,76.0,21569570.0
1,33601001754,0,31.0,99.0,21569570.0
2,33601007832,0,84.0,248.0,760724.7
3,33601008617,1,119.0,56844.0,30940400.0
4,33601010682,0,151.458589,7908.0,22331990.0


In [15]:
# Restore the previously saved engagement clustering model.
# NOTE(review): pickle executes arbitrary code on load -- only load model files you trust.
with open("../models/telecom_user_engagement.pkl", mode="rb") as model_file:
    kmeans1 = pickle.load(model_file)

In [16]:
# Restore the previously saved experience clustering model.
# NOTE(review): pickle executes arbitrary code on load -- only load model files you trust.
with open("../models/telecom_user_experience.pkl", mode="rb") as model_file:
    kmeans2 = pickle.load(model_file)

# K-means Clustering


In [17]:
# Keep only the three engagement metrics, keyed by subscriber number,
# then standardise each column to zero mean / unit variance.
engagement_features = ['number of xDR Sessions', 'Dur (ms)', 'Total Data Volume (Bytes)']
engagement_df = telecom_engagement_df.set_index('MSISDN/Number')[engagement_features]

scaler = StandardScaler()
scale_data = scaler.fit_transform(engagement_df)
scale_data

array([[-0.49203077, -0.15370369,  0.3859319 ],
       [-0.49203077,  0.20223377, -1.09205693],
       [-0.49203077, -0.52250403, -0.54851071],
       ...,
       [-0.49203077, -0.74909242,  0.05003262],
       [-0.49203077, -0.01980251, -0.42977986],
       [-0.49203077,  0.59838496, -0.19230338]])

In [18]:
# Rescale every row (user) of the standardised engagement data to unit L2 norm;
# norm='l2' and axis=1 are spelled out here but match normalize()'s defaults.
normalized_data = normalize(scale_data, norm='l2', axis=1)
normalized_data

array([[-0.76409062, -0.23869147,  0.59932622],
       [-0.40505194,  0.16648386, -0.89900837],
       [-0.54469765, -0.57843276, -0.60722319],
       ...,
       [-0.5481452 , -0.83452384,  0.05573866],
       [-0.75279737, -0.03029746, -0.6575547 ],
       [-0.61641759,  0.74965842, -0.24091824]])

In [19]:
# Engagement score = Euclidean distance from each user to the centroid of the
# least-engaged cluster (larger distance => more engaged).
# NOTE(review): index 3 is taken as the least-engaged cluster of the saved model;
# confirm against the notebook that trained and pickled `kmeans1`.
less_engaged_cluster = 3

# Use transform(), not fit_transform(): the model was loaded already fitted.
# Re-fitting would discard the saved centroids and could reorder the cluster
# indices, making the hard-coded index above point at a different cluster.
distance = kmeans1.transform(normalized_data)

# Column `less_engaged_cluster` of the (n_samples, n_clusters) distance matrix.
distance_from_less_engagement = distance[:, less_engaged_cluster].tolist()
telecom_engagement_df['engagement_score'] = distance_from_less_engagement
telecom_engagement_df.head(5)

Unnamed: 0,MSISDN/Number,Cluster,number of xDR Sessions,Dur (ms),Total Data Volume (Bytes),engagement_score
0,33601001722,2,1,116720140.0,878690600.0,0.623012
1,33601001754,0,1,181230963.0,156859600.0,1.207094
2,33601007832,0,1,49878024.0,422320700.0,1.218189
3,33601008617,1,2,37104453.0,1457411000.0,1.494503
4,33601010682,1,2,253983077.0,615217200.0,1.673454


In [20]:
# Keep only the three experience metrics, keyed by subscriber number,
# then standardise each column to zero mean / unit variance.
experience_features = [
    'Total Avg RTT (ms)',
    'Total Avg Bearer TP (kbps)',
    'Total TCP Retrans. Vol (Bytes)',
]
experience_df = telecom_experience_df.set_index('MSISDN/Number')[experience_features]

scaler = StandardScaler()
scale_data = scaler.fit_transform(experience_df)
scale_data

array([[-0.18757153, -0.49997097, -0.05714637],
       [-0.20961963, -0.49943776, -0.05714637],
       [-0.13171635, -0.49598343, -0.18790118],
       ...,
       [-0.17875229, -0.26143688, -0.19131117],
       [-0.19639077, -0.49904364, -0.05714637],
       [-0.06783773, -0.50096786, -0.05714637]])

In [21]:
# Rescale every row (user) of the standardised experience data to unit L2 norm;
# norm='l2' and axis=1 are spelled out here but match normalize()'s defaults.
normalized_data = normalize(scale_data, norm='l2', axis=1)
normalized_data

array([[-0.34926448, -0.93096271, -0.10640846],
       [-0.38486999, -0.9169876 , -0.10492302],
       [-0.2410207 , -0.90757354, -0.34383031],
       ...,
       [-0.48311151, -0.70658207, -0.51705421],
       [-0.3641367 , -0.92529858, -0.10595758],
       [-0.13333943, -0.98468458, -0.11232488]])

In [22]:
# Experience score = Euclidean distance from each user to the centroid of the
# worst-experience cluster (larger distance => better experience).
# NOTE(review): index 1 is taken as the worst-experience cluster of the saved model;
# confirm against the notebook that trained and pickled `kmeans2`.
worst_experience_cluster = 1

# Use transform(), not fit_transform(): the model was loaded already fitted.
# Re-fitting would discard the saved centroids and could reorder the cluster
# indices, making the hard-coded index above point at a different cluster.
distance = kmeans2.transform(normalized_data)

# Column `worst_experience_cluster` of the (n_samples, n_clusters) distance matrix.
distance_from_worst_experience_cluster = distance[:, worst_experience_cluster].tolist()
telecom_experience_df['experience_score'] = distance_from_worst_experience_cluster
telecom_experience_df.head()

Unnamed: 0,MSISDN/Number,Cluster,Total Avg RTT (ms),Total Avg Bearer TP (kbps),Total TCP Retrans. Vol (Bytes),experience_score
0,33601001722,0,46.0,76.0,21569570.0,1.801543
1,33601001754,0,31.0,99.0,21569570.0,1.792665
2,33601007832,0,84.0,248.0,760724.7,1.776636
3,33601008617,1,119.0,56844.0,30940400.0,0.213625
4,33601010682,0,151.458589,7908.0,22331990.0,1.836936


In [23]:
# Join engagement and experience rows for the same subscriber (inner join on MSISDN),
# then define satisfaction as the arithmetic mean of the two scores.
user_satisfaction_df = telecom_engagement_df.merge(
    telecom_experience_df, how='inner', on='MSISDN/Number')

score_total = user_satisfaction_df['engagement_score'].add(
    user_satisfaction_df['experience_score'])
user_satisfaction_df['satisfaction_score'] = score_total.div(2)
user_satisfaction_df.head()

Unnamed: 0,MSISDN/Number,Cluster_x,number of xDR Sessions,Dur (ms),Total Data Volume (Bytes),engagement_score,Cluster_y,Total Avg RTT (ms),Total Avg Bearer TP (kbps),Total TCP Retrans. Vol (Bytes),experience_score,satisfaction_score
0,33601001722,2,1,116720140.0,878690600.0,0.623012,0,46.0,76.0,21569570.0,1.801543,1.212278
1,33601001754,0,1,181230963.0,156859600.0,1.207094,0,31.0,99.0,21569570.0,1.792665,1.49988
2,33601007832,0,1,49878024.0,422320700.0,1.218189,0,84.0,248.0,760724.7,1.776636,1.497412
3,33601008617,1,2,37104453.0,1457411000.0,1.494503,1,119.0,56844.0,30940400.0,0.213625,0.854064
4,33601010682,1,2,253983077.0,615217200.0,1.673454,0,151.458589,7908.0,22331990.0,1.836936,1.755195


In [24]:
# Reduce the merged frame to the three score columns, indexed by subscriber number.
score_columns = ['engagement_score', 'experience_score', 'satisfaction_score']
user_satisfaction_df = user_satisfaction_df.set_index('MSISDN/Number')[score_columns]
user_satisfaction_df.head()

Unnamed: 0_level_0,engagement_score,experience_score,satisfaction_score
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33601001722,0.623012,1.801543,1.212278
33601001754,1.207094,1.792665,1.49988
33601007832,1.218189,1.776636,1.497412
33601008617,1.494503,0.213625,0.854064
33601010682,1.673454,1.836936,1.755195


In [25]:
# Ten subscribers with the highest satisfaction score, best first.
ranked_by_satisfaction = user_satisfaction_df.sort_values(
    by='satisfaction_score', ascending=False)
ranked_by_satisfaction.head(n=10)


Unnamed: 0_level_0,engagement_score,experience_score,satisfaction_score
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33770416232,1.885466,1.876769,1.881117
33626211357,1.884858,1.876313,1.880586
33662711089,1.883601,1.877297,1.880449
33698176002,1.887336,1.872749,1.880043
33698729715,1.887272,1.87225,1.879761
33644315244,1.884971,1.873601,1.879286
33630302633,1.887014,1.871249,1.879132
33644042541,1.881684,1.876123,1.878903
33663801778,1.886204,1.871586,1.878895
33760709252,1.882465,1.87487,1.878667


In [26]:
# Fit a linear model predicting the satisfaction score from the two component scores.
# The target was defined as the mean of these two features, so a near-exact
# linear fit is expected.
X = user_satisfaction_df[['engagement_score', 'experience_score']]
y = user_satisfaction_df[['satisfaction_score']]

# Hold out 20% of users for evaluation; random_state is pinned so that
# Restart & Run All reproduces the same split (same seed as the KMeans cell).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [27]:
# Predict on the held-out users and report how well the model does;
# previously the predictions were computed but never evaluated.
y_pred = model.predict(X_test)
pd.Series(
    {
        'mse': metrics.mean_squared_error(y_test, y_pred),
        'r2': metrics.r2_score(y_test, y_pred),
    },
    name='linear_regression_holdout',
)


In [28]:
# Clustering input: only the engagement and experience scores (satisfaction is derived from them).
user_satisfaction_df1 = user_satisfaction_df.loc[:, ['engagement_score', 'experience_score']]
user_satisfaction_df1

Unnamed: 0_level_0,engagement_score,experience_score
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1
33601001722,0.623012,1.801543
33601001754,1.207094,1.792665
33601007832,1.218189,1.776636
33601008617,1.494503,0.213625
33601010682,1.673454,1.836936
...,...,...
33789960306,0.258757,1.840268
33789967113,1.197874,1.805538
33789996170,1.142288,1.643293
33789997247,0.938467,1.797908


In [29]:
# Standardise both score columns to zero mean / unit variance before clustering.
# Note: `scaler` and `scale_data` are rebound here, replacing the experience-stage values.
scaler = StandardScaler()
scale_data = scaler.fit_transform(X=user_satisfaction_df1)
scale_data

array([[-1.50696108,  0.64001109],
       [ 0.10033715,  0.62618928],
       [ 0.13086922,  0.6012335 ],
       ...,
       [-0.07799983,  0.39363869],
       [-0.63888013,  0.63435118],
       [-1.2728974 ,  0.70046705]])

In [30]:
# Rescale every row (user) of the standardised scores to unit L2 norm;
# norm='l2' and axis=1 are spelled out here but match normalize()'s defaults.
normalized_data = normalize(scale_data, norm='l2', axis=1)
normalized_data

array([[-0.92042925,  0.39090919],
       [ 0.1582163 ,  0.98740448],
       [ 0.21268768,  0.97712023],
       ...,
       [-0.19437168,  0.98092795],
       [-0.70961751,  0.70458711],
       [-0.87610734,  0.4821161 ]])

In [31]:
# Cluster users into 3 groups on the normalised (engagement, experience) scores.
# n_init is pinned to 10 (the historical scikit-learn default) so the result does
# not change, or emit a FutureWarning, on newer scikit-learn releases whose
# default is different; random_state fixes the centroid initialisation.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(normalized_data)

# NOTE: this rebinds `X`, replacing the regression feature frame defined earlier.
X = np.array(normalized_data)
y_kmeans

array([2, 0, 0, ..., 2, 2, 2], dtype=int32)

In [32]:
# Attach each user's cluster label as the first column of a fresh copy of the
# score table, leaving `user_satisfaction_df` itself untouched.
ordered_columns = ['Cluster', *user_satisfaction_df.columns]
clustered_tellco_satisfaction_df = user_satisfaction_df.assign(Cluster=y_kmeans)[ordered_columns]
clustered_tellco_satisfaction_df

Unnamed: 0_level_0,Cluster,engagement_score,experience_score,satisfaction_score
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
33601001722,2,0.623012,1.801543,1.212278
33601001754,0,1.207094,1.792665,1.499880
33601007832,0,1.218189,1.776636,1.497412
33601008617,1,1.494503,0.213625,0.854064
33601010682,0,1.673454,1.836936,1.755195
...,...,...,...,...
33789960306,2,0.258757,1.840268,1.049512
33789967113,0,1.197874,1.805538,1.501706
33789996170,2,1.142288,1.643293,1.392790
33789997247,2,0.938467,1.797908,1.368187


In [33]:
# Average satisfaction and experience score within each cluster.
summary_columns = ['satisfaction_score', 'experience_score']
clustered_tellco_satisfaction_df.groupby('Cluster')[summary_columns].mean()

Unnamed: 0_level_0,satisfaction_score,experience_score
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.572674,1.777362
1,0.886303,0.415087
2,1.292967,1.772404


In [35]:
# Build the SQLAlchemy engine for the `tellco` MySQL database.
# The connection URL (which embeds credentials) is read from the TELLCO_DB_URL
# environment variable so secrets need not be committed with the notebook; the
# previous local-development URL is kept as the fallback for backward compatibility.
engine = create_engine(
    os.environ.get('TELLCO_DB_URL', 'mysql+pymysql://root:@localhost/tellco'))

In [43]:
# Persist the clustered scores to the `telco_satisfaction_score` table,
# replacing any existing table. Failures are reported, not raised, so the
# notebook keeps running when no database is reachable.
print('writing to the database')
try:
    frame = clustered_tellco_satisfaction_df.to_sql(
        name="telco_satisfaction_score",
        con=engine,
        if_exists='replace',
    )
except Exception as e:
    print("Error writing to database: ", e)
else:
    print('successful')

writing to the database
Error writing to database:  (pymysql.err.OperationalError) (2003, "Can't connect to MySQL server on 'localhost' ([Errno 111] Connection refused)")
(Background on this error at: https://sqlalche.me/e/14/e3q8)
