# Plan:
    I - Quels sont les trackers les plus utilisés ?
    II - Quelles sont les catégories de trackers les plus représentées ?

# I - Quels sont les trackers les plus utilisés ?

In [313]:
import pandas
import ast
import plotly.express as px 

In [314]:
# Both CSV files are parsed with the same quoting / whitespace options, so the
# options are declared once and reused for each read.
csv_read_options = {"quotechar": '"', "skipinitialspace": True, "low_memory": False}

# Applications table: later cells read its "handle" and "trackers" columns.
applications = pandas.read_csv("EP_Data_Extended.csv", **csv_read_options)
# Tracker metadata: later cells look up one column per tracker id.
data_trackers = pandas.read_csv("data_trackers.csv", **csv_read_options)

In [315]:
# Extract the two columns used by the rest of the analysis.
handles = applications["handle"]
trackers = applications["trackers"]

# Preview both series; each label is followed by the series on its own lines.
print("All handles", handles, sep="\n")
print("\nAll trackers", trackers, sep="\n")

All handles
0                                           com.semitan.tan
1                                   com.allocine.androidapp
2                                  com.frostnerd.dnschanger
3         com.fullsix.android.labanquepostale.accountaccess
4                                             fr.axa.monaxa
                                ...                        
195302                      com.iudesk.android.photo.editor
195303                                        org.wikipedia
195304                              com.mobidia.android.mdm
195305                                     com.metago.astro
195306                               net.slickdeals.android
Name: handle, Length: 195307, dtype: object

All trackers
0                                                48,105,312
1                                                      63,7
2                                                       NaN
3                                                       NaN
4                             

In [316]:
# Materialise both columns as plain Python lists.
# The lists are read straight from `applications` instead of being derived from
# the Series previously stored under the same names: the original form
# (`handles = handles.values.tolist()`) raised AttributeError when the cell was
# run a second time, because a list has no `.values` attribute.
handles = applications["handle"].tolist()
trackers = applications["trackers"].tolist()

In [317]:
# Count how often each tracker id and each permission appears across all apps.
# Every non-null entry of `trackers` is a comma-separated string such as
# "48,105,312"; numeric tokens are tracker ids, tokens containing "permission"
# are permissions.

dict_trackers = {}
dict_permissions = {}

for el in trackers:
    # Apps without any entry come out of pandas as NaN (a float): skip them.
    if not isinstance(el, str):
        continue
    for nb in el.split(","):
        nb = nb.strip()  # tolerate whitespace around the separators
        if nb.isdigit():
            dict_trackers[nb] = dict_trackers.get(nb, 0) + 1
        elif "permission" in nb:
            dict_permissions[nb] = dict_permissions.get(nb, 0) + 1
        # Any other token (for instance the empty string left by a trailing
        # comma) is neither a tracker id nor a permission and is ignored.
        # Previously such tokens were stored in dict_permissions with a count
        # that was reset to 1 on every occurrence.


In [318]:
# Aggregate the per-id counts into per-name counts, merging every Google and
# every Facebook tracker under a single label.
tuple_trackers = list(dict_trackers.items())  # [(tracker id, occurrences), ...]
result = {}

for tracker_id, occurrences in tuple_trackers:
    # NOTE(review): id "000" is skipped as in the original cell; presumably it
    # has no metadata column in data_trackers -- confirm.
    if tracker_id == "000":
        continue

    # data_trackers is indexed by tracker id as column label; the first row of
    # that column holds the tracker metadata as a stringified dict.
    # .iloc[0] makes the positional access explicit.
    tracker_info = ast.literal_eval(data_trackers[tracker_id].iloc[0])
    name = tracker_info["name"]

    # "network_signature" is only inspected when the name does not already
    # identify the vendor (same short-circuit order as before).
    if "Google" in name or "com.google" in tracker_info["network_signature"]:
        label = "Google"
    elif "Facebook" in name or "com.facebook" in tracker_info["network_signature"]:
        label = "Facebook"
    else:
        label = name

    # Always accumulate: two tracker ids sharing the same name used to
    # overwrite each other in the non-Google / non-Facebook case.
    result[label] = result.get(label, 0) + occurrences

print(pandas.DataFrame(result.items(), columns=['Name', 'Occurrence']))

                  Name  Occurrence
0               Google      365191
1    Mobile Engagement          72
2                Smart        1521
3            Ad4Screen        1171
4             Weborama          29
..                 ...         ...
376              Bolts          68
377               Vpon           4
378      Treasure Data          12
379              Pendo          33
380            Plexure           1

[381 rows x 2 columns]


In [319]:
# Rank trackers from the most to the least frequent. sorted() is stable, so
# trackers with equal counts keep the relative order they had in `result`, and
# the new dict preserves that ranking through insertion order.
ordered_result = dict(
    sorted(result.items(), key=lambda name_count: name_count[1], reverse=True)
)

In [320]:
# get n first element of a dict
def getNFirstElement(dict, n):
    """Return a new dict holding the first ``n`` items of ``dict``.

    Items are taken in the mapping's iteration (insertion) order. When the
    mapping holds fewer than ``n`` items, all of them are returned; when ``n``
    is zero or negative, the result is empty.

    The previous implementation returned ``None`` in those edge cases (and
    whenever ``n == len(dict) == 1``) because its only ``return`` sat inside
    the loop, and it wrapped the scan in a redundant ``range(n)`` loop.

    Args:
        dict: mapping to crop. The parameter name shadows the builtin and is
            kept only for backward compatibility with keyword callers.
        n: maximum number of items to keep.

    Returns:
        A dict with at most ``n`` items.
    """
    cropped_dict = {}
    for k, v in dict.items():
        if len(cropped_dict) >= n:
            break
        cropped_dict[k] = v
    return cropped_dict

## Liste des 20 trackers les plus utilisés

In [321]:
# Keep the 20 leading entries of the ranking (ordered_result is sorted by
# decreasing occurrence) and show them as a two-column table.
NFirst_Result = getNFirstElement(ordered_result, 20)
most_used_trackers = pandas.DataFrame(
    list(NFirst_Result.items()),
    columns=["Name", "Occurrence"],
)
print(most_used_trackers)

                            Name  Occurrence
0                         Google      365191
1                       Facebook      193197
2                         Flurry       17663
3                      AppsFlyer       16051
4                         Inmobi       15840
5                           Moat       15131
6                    Unity3d Ads       14929
7                  Twitter MoPub       13844
8   AppLovin (MAX and SparkLabs)       11686
9                         Adjust       11362
10                      AdColony        9266
11                        Vungle        9012
12                    ironSource        8136
13           Integral Ad Science        7615
14          IAB Open Measurement        7594
15          Amazon Advertisement        7101
16                     OneSignal        6696
17                        Branch        6677
18                    ChartBoost        6435
19                        Tapjoy        5722


## Histogrammes des trackers les plus utilisés

In [322]:
# One bar per top tracker: the tracker name on the x axis, its occurrence
# count as the bar height, and a distinct colour per tracker.
tracker_names = list(NFirst_Result.keys())
tracker_counts = list(NFirst_Result.values())

fig = px.histogram(
    x=tracker_names,
    y=tracker_counts,
    color=tracker_names,
    title="Histogramme des trackers les plus utilisés",
)
fig.show()

## Indice de Herfindahl-Hirschman

In [323]:
# Herfindahl-Hirschman index: the sum of the squared market shares, with each
# share expressed in percent of the total number of tracker occurrences.
total_market = sum(result.values())
market_shares = [((nb / total_market) * 100) ** 2 for nb in result.values()]
IHH_index = sum(market_shares)

print("IHH index :", IHH_index)

IHH index : 1994.731223100649


L'IHH est proche de 2000 : la concentration du marché est élevée.

# II - Quelles sont les catégories de trackers les plus représentées ?

In [324]:
# Count, for each category, how many of the detected tracker ids declare it.
# Each tracker contributes 1 per category it lists; the number of apps using
# the tracker is not taken into account.

catResult = {}

for tracker_id, _occurrences in tuple_trackers:
    if tracker_id == "000":
        continue
    # First row of the tracker's column holds its metadata as a stringified dict.
    tracker_info = ast.literal_eval(data_trackers[tracker_id][0])
    for category in tracker_info["categories"]:
        already_counted = bool(category) and category in catResult
        if already_counted:
            catResult[category] += 1
        else:
            # First sighting (a falsy category is always treated as such).
            catResult[category] = 1

# number of trackers found per category
print(catResult)

{'Analytics': 182, 'Advertisement': 103, 'Crash reporting': 10, 'Identification': 20, 'Profiling': 47, 'Location': 56}


## Diagramme circulaire : répartition des catégories de trackers 

In [325]:
# Pie chart with one slice per tracker category, sized by the category's count.
category_names = list(catResult.keys())
category_counts = list(catResult.values())

fig = px.pie(names=category_names, values=category_counts)
fig.show()

In [326]:

# Build a (Name, Date) table giving the creation date of each top tracker.

# NOTE(review): `result` is reset here but never read in this cell. The reset
# wipes the per-tracker counts built earlier, so the IHH cell cannot be re-run
# afterwards. Kept for compatibility -- confirm whether it can be dropped.
result = {}

# Tracker name -> creation date, read from the tracker metadata.
date = {}
for k, v in tuple_trackers:
    if k != "000":
        # First row of the tracker's column holds a stringified dict.
        key = ast.literal_eval(data_trackers[k].iloc[0])
        date[key["name"]] = key["creation_date"]

# "Google" and "Facebook" are labels that merge several trackers, so they get
# a fixed date. The two values are carried over unchanged from the original
# cell; their origin is not visible here -- TODO confirm.
merged_dates = {"Google": "2017-09-24", "Facebook": "2017-12-05"}

dico = {}
dico_occurrence = {}  # NOTE(review): unused in this cell, kept for compatibility
for name in most_used_trackers.loc[:, "Name"]:
    # Exactly one row per top tracker, in ranking order, found by exact name.
    # The previous substring test (`name in k`) scanned every metadata name
    # for every top tracker and could add unrelated trackers whose name merely
    # contains `name`, breaking the row-by-row pairing with the counts.
    dico[name] = merged_dates.get(name, date.get(name))

df = pandas.DataFrame(dico.items(), columns=['Name', 'Date'])
print(df)




                            Name        Date
0                         Google  2017-09-24
1                       Facebook  2017-12-05
2                         Flurry  2017-09-24
3                      AppsFlyer  2017-09-24
4                         Inmobi  2018-03-04
5                           Moat  2017-12-03
6                    Unity3d Ads  2018-03-04
7                  Twitter MoPub  2017-09-27
8   AppLovin (MAX and SparkLabs)  2018-01-05
9                         Adjust  2017-12-03
10                      AdColony  2018-03-04
11                        Vungle  2019-03-10
12                    ironSource  2018-08-16
13           Integral Ad Science  2019-10-26
14          IAB Open Measurement  2020-09-19
15          Amazon Advertisement  2018-03-04
16                     OneSignal  2019-04-15
17                        Branch  2019-03-10
18                    ChartBoost  2017-12-03
19                        Tapjoy  2019-04-15


In [327]:
# Pair each top tracker's creation date with its occurrence count.
# Joining on "Name" replaces the former `del most_used_trackers["Name"]` +
# positional concat: most_used_trackers is no longer mutated (so the previous
# cell, which reads its "Name" column, can be re-run), and every date is
# matched to the right count even if the two tables are ordered differently.
resultat = df.merge(most_used_trackers, on="Name")

# Occurrences summed per creation date, one colour per tracker.
fig = px.histogram(resultat, x="Date", y="Occurrence", color="Name")
fig.show()
