## Telecom User Data Analysis

### Libraries

In [7]:
import pandas as pd
from functools import reduce

### Data Preprocessing

In [90]:
df = pd.read_excel("../data/telcom.xlsx",engine='openpyxl')

In [91]:
df.head()

Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Last Location Name,...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
0,13114483460844900352,2019-04-04 12:01:18,770.0,2019-04-25 14:35:31,662.0,1823652.0,208201400000000.0,33664960000.0,35521210000000.0,9164566995485190,...,15854611.0,2501332.0,8198936.0,9656251.0,278082303.0,14344150.0,171744450.0,8814393.0,36749741.0,308879636.0
1,13114483482878900224,2019-04-09 13:04:04,235.0,2019-04-25 08:15:48,606.0,1365104.0,208201900000000.0,33681850000.0,35794010000000.0,L77566A,...,20247395.0,19111729.0,18338413.0,17227132.0,608750074.0,1170709.0,526904238.0,15055145.0,53800391.0,653384965.0
2,13114483484080500736,2019-04-09 17:42:11,1.0,2019-04-25 11:58:13,652.0,1361762.0,208200300000000.0,33760630000.0,35281510000000.0,D42335A,...,19725661.0,14699576.0,17587794.0,6163408.0,229584621.0,395630.0,410692588.0,4215763.0,27883638.0,279807335.0
3,13114483485442799616,2019-04-10 00:31:25,486.0,2019-04-25 07:36:35,171.0,1321509.0,208201400000000.0,33750340000.0,35356610000000.0,T21824A,...,21388122.0,15146643.0,13994646.0,1097942.0,799538153.0,10849722.0,749039933.0,12797283.0,43324218.0,846028530.0
4,13114483499480700928,2019-04-12 20:10:23,565.0,2019-04-25 10:40:32,954.0,1089009.0,208201400000000.0,33699800000.0,35407010000000.0,D88865A,...,15259380.0,18962873.0,17124581.0,415218.0,527707248.0,3529801.0,550709500.0,13910322.0,38542814.0,569138589.0


In [6]:
### Check for sufficiency of data to answer objectives by reviewing the objective and 
### available data. And check the number of observations and required sample size.

In [92]:
df.columns.to_list()

['Bearer Id',
 'Start',
 'Start ms',
 'End',
 'End ms',
 'Dur. (ms)',
 'IMSI',
 'MSISDN/Number',
 'IMEI',
 'Last Location Name',
 'Avg RTT DL (ms)',
 'Avg RTT UL (ms)',
 'Avg Bearer TP DL (kbps)',
 'Avg Bearer TP UL (kbps)',
 'TCP DL Retrans. Vol (Bytes)',
 'TCP UL Retrans. Vol (Bytes)',
 'DL TP < 50 Kbps (%)',
 '50 Kbps < DL TP < 250 Kbps (%)',
 '250 Kbps < DL TP < 1 Mbps (%)',
 'DL TP > 1 Mbps (%)',
 'UL TP < 10 Kbps (%)',
 '10 Kbps < UL TP < 50 Kbps (%)',
 '50 Kbps < UL TP < 300 Kbps (%)',
 'UL TP > 300 Kbps (%)',
 'HTTP DL (Bytes)',
 'HTTP UL (Bytes)',
 'Activity Duration DL (ms)',
 'Activity Duration UL (ms)',
 'Dur. (ms).1',
 'Handset Manufacturer',
 'Handset Type',
 'Nb of sec with 125000B < Vol DL',
 'Nb of sec with 1250B < Vol UL < 6250B',
 'Nb of sec with 31250B < Vol DL < 125000B',
 'Nb of sec with 37500B < Vol UL',
 'Nb of sec with 6250B < Vol DL < 31250B',
 'Nb of sec with 6250B < Vol UL < 37500B',
 'Nb of sec with Vol DL < 6250B',
 'Nb of sec with Vol UL < 1250B',
 'Socia

In [93]:
df.shape

(150001, 55)

In [12]:
### Check for timeliness of the data

In [10]:
### checking for duplicates

In [13]:
### Check for column organization and naming and correct them.

In [59]:
### Check for missing values and either fill them with a value that will, in no way, 
### affect the analysis or remove the record.

In [94]:
def handle_missing_values(df,x):
    """
    this algorithm does the following
    - remove columns with x percentage of missing values
    - fill the missing values with the mean
    returns:
        - df
        - percentage of missing values
    """
    missing_percentage = round((df.isnull().sum().sum()/\
            reduce(lambda x, y: x*y, df.shape))*100,2)
    cols_fill,cols_out = [],[]
    null_cols = df.isnull().sum().to_dict()
    for key,val in null_cols.items():
        if val/df.shape[0] > x:
            cols_out.append(key)
        elif val > 0 and df[key].dtype.kind in 'biufc':
            cols_fill.append(key)
    df.drop(cols_to_remove, axis=1)
    for i in df.columns:
        if i in cols_fill:
            df.fillna(df[i].mean().round(1), inplace=True)
    return missing_percentage, df

In [97]:
# test the function
missing_percentage,df = handle_missing_values(df,x=0.3)
True in df.isna().any().to_list()

False

In [15]:
### Check for misspellings, Inconsistent capitalization, and Incorrect punctuation and other typos and 
### make corrections.

In [16]:
### Check for correct character patterns like length and type of characters and make corrections.

In [17]:
### Check for values that don't make sense given knowledge of the field and make corrections.

In [18]:
### Check for value ranges and make corrections.

In [19]:
### Check for repeating or skipping patterns in variables whose values must be in sequence 
### or whose values must be unique and make corrections

In [18]:
# Identifying the top 10 handsets used by the customers.

In [19]:
# Identifying the top 3 handset manufacturers

In [None]:
# Identify the top 5 handsets per top 3 handset manufacturer

In [20]:
# Aggregation per user the following information in the column  
# number of xDR sessions
# Session duration
# the total download (DL) and upload (UL) data
# the total data volume (in Bytes) during this session for each application
