# Importing Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load Dataset

In [2]:
traffic= pd.read_csv('traffic.csv')
traffic.head()

Unnamed: 0,event,date,country,city,artist,album,track,isrc,linkid
0,click,2021-08-21,Saudi Arabia,Jeddah,Tesher,Jalebi Baby,Jalebi Baby,QZNWQ2070741,2d896d31-97b6-4869-967b-1c5fb9cd4bb8
1,click,2021-08-21,Saudi Arabia,Jeddah,Tesher,Jalebi Baby,Jalebi Baby,QZNWQ2070741,2d896d31-97b6-4869-967b-1c5fb9cd4bb8
2,click,2021-08-21,India,Ludhiana,Reyanna Maria,So Pretty,So Pretty,USUM72100871,23199824-9cf5-4b98-942a-34965c3b0cc2
3,click,2021-08-21,France,Unknown,"Simone & Simaria, Sebastian Yatra",No Llores Más,No Llores Más,BRUM72003904,35573248-4e49-47c7-af80-08a960fa74cd
4,click,2021-08-21,Maldives,Malé,Tesher,Jalebi Baby,Jalebi Baby,QZNWQ2070741,2d896d31-97b6-4869-967b-1c5fb9cd4bb8


In [3]:
traffic.shape

(226278, 9)

- There are 226278 rows and 9 columns

In [4]:
traffic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226278 entries, 0 to 226277
Data columns (total 9 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   event    226278 non-null  object
 1   date     226278 non-null  object
 2   country  226267 non-null  object
 3   city     226267 non-null  object
 4   artist   226241 non-null  object
 5   album    226273 non-null  object
 6   track    226273 non-null  object
 7   isrc     219157 non-null  object
 8   linkid   226278 non-null  object
dtypes: object(9)
memory usage: 15.5+ MB


- Date column datatype is object so we need to convert it into date

## Converting the datatype of Date column from object to date

In [5]:
traffic['date']= pd.to_datetime(traffic['date'])

In [6]:
traffic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226278 entries, 0 to 226277
Data columns (total 9 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   event    226278 non-null  object        
 1   date     226278 non-null  datetime64[ns]
 2   country  226267 non-null  object        
 3   city     226267 non-null  object        
 4   artist   226241 non-null  object        
 5   album    226273 non-null  object        
 6   track    226273 non-null  object        
 7   isrc     219157 non-null  object        
 8   linkid   226278 non-null  object        
dtypes: datetime64[ns](1), object(8)
memory usage: 15.5+ MB


## Check for Missing Values

In [8]:
traffic.isna().sum()

event         0
date          0
country      11
city         11
artist       37
album         5
track         5
isrc       7121
linkid        0
dtype: int64

In [10]:
traffic.isna().sum()/len(traffic)*100

event      0.000000
date       0.000000
country    0.004861
city       0.004861
artist     0.016352
album      0.002210
track      0.002210
isrc       3.147014
linkid     0.000000
dtype: float64

- There are missing values in the data and missing percentage is less than 25% so we can impute the missing values

## Check for Duplicate Data

In [11]:
traffic.duplicated().sum()

103711

- There are 103711 duplicate values in the data so we need to drop these duplicates

In [15]:
traffic.drop_duplicates(inplace=True)

In [16]:
traffic.duplicated().sum()

0

- We have successfully handled duplicate data

In [17]:
traffic.shape

(122567, 9)

## Handling Missing Data

In [18]:
traffic.isna().sum()

event         0
date          0
country       5
city          5
artist       28
album         4
track         4
isrc       6306
linkid        0
dtype: int64

In [19]:
traffic['country'].value_counts()

country
United States     28664
India             18689
France            10565
Saudi Arabia       7682
United Kingdom     5095
                  ...  
Samoa                 2
Macao                 2
Afghanistan           2
Lesotho               1
Sint Maarten          1
Name: count, Length: 211, dtype: int64

In [20]:
traffic['country']= traffic['country'].fillna(traffic['country'].mode()[0])

In [21]:
traffic['city'].value_counts()

city
Unknown                8797
Jeddah                 2497
Riyadh                 2232
Hyderabad              1088
Dammam                 1002
                       ... 
Kentwood                  1
Saint-Come--Liniere       1
Pontedassio               1
Greater Noida             1
Soumagne                  1
Name: count, Length: 11993, dtype: int64

In [22]:
traffic['city']= traffic['city'].fillna(traffic['city'].mode()[0])

In [23]:
traffic['artist'].value_counts()

artist
Tesher                            8288
Anne-Marie                        4029
Tundra Beats                      3951
Roddy Ricch                       3107
Olivia Rodrigo                    3037
                                  ... 
Lynda                                1
Neutro Shorty                        1
Meridian Film Music Recordings       1
The Marvelettes                      1
Kiiara                               1
Name: count, Length: 2419, dtype: int64

In [24]:
traffic['artist']= traffic['artist'].fillna(traffic['artist'].mode()[0])

In [25]:
traffic['album'].value_counts()

album
Jalebi Baby                             8288
Beautiful                               4028
Beautiful Day                           3950
Late At Night                           3059
ily (i love you baby) (feat. Emilee)    2956
                                        ... 
غيمة                                       1
Ando Bien                                  1
Wish You Were Sober                        1
Energy                                     1
low kii savage                             1
Name: count, Length: 3254, dtype: int64

In [26]:
traffic['album']= traffic['album'].fillna(traffic['album'].mode()[0])

In [27]:
traffic['track'].value_counts()

track
Jalebi Baby                             8288
Beautiful                               4037
Beautiful Day                           3951
Late At Night                           3059
ily (i love you baby) (feat. Emilee)    2956
                                        ... 
Vem Mim Amar                               1
Fast As You                                1
Bad Boy (with Young Thug)                  1
NO OPS                                     1
Gold                                       1
Name: count, Length: 3562, dtype: int64

In [28]:
traffic['track']= traffic['track'].fillna(traffic['track'].mode()[0])

In [29]:
traffic['isrc'].value_counts()

isrc
QZNWQ2070741    8288
GBAHS2100225    4028
QZHN92194591    3950
USAT22102236    3059
QZJRC1945204    2947
                ... 
QMCE32100057       1
QM24S2104798       1
USAT22102351       1
QZ93L2106845       1
USWL12100113       1
Name: count, Length: 709, dtype: int64

In [30]:
traffic['isrc']= traffic['isrc'].fillna(traffic['isrc'].mode()[0])

In [31]:
traffic.isna().sum()

event      0
date       0
country    0
city       0
artist     0
album      0
track      0
isrc       0
linkid     0
dtype: int64

In [41]:
traffic.head()

Unnamed: 0,event,date,country,city,artist,album,track,isrc,linkid
0,click,2021-08-21,Saudi Arabia,Jeddah,Tesher,Jalebi Baby,Jalebi Baby,QZNWQ2070741,2d896d31-97b6-4869-967b-1c5fb9cd4bb8
2,click,2021-08-21,India,Ludhiana,Reyanna Maria,So Pretty,So Pretty,USUM72100871,23199824-9cf5-4b98-942a-34965c3b0cc2
3,click,2021-08-21,France,Unknown,"Simone & Simaria, Sebastian Yatra",No Llores Más,No Llores Más,BRUM72003904,35573248-4e49-47c7-af80-08a960fa74cd
4,click,2021-08-21,Maldives,Malé,Tesher,Jalebi Baby,Jalebi Baby,QZNWQ2070741,2d896d31-97b6-4869-967b-1c5fb9cd4bb8
5,click,2021-08-21,United States,Los Angeles,KenTheMan,I'm Perfect,I'm Perfect,US39N2102090,190c7170-4044-4c97-9709-926917155b02


## Total pageview events

In [40]:
traffic[traffic['event']=='pageview'].shape[0]

73360

## Average number of pageview events per day

In [68]:
pageview_data = traffic[traffic['event'] == 'pageview']

In [69]:
pageview_counts_per_day = pageview_data.groupby('date').size()

In [70]:
average_pageviews_per_day = pageview_counts_per_day.mean()
average_pageviews_per_day

10480.0

## Total count of other recorded events

In [54]:
traffic[traffic['event']!='pageview'].shape[0]

49207

## Distribution of other recorded events

In [55]:
traffic[traffic['event']=='click'].shape[0]

32499

In [56]:
traffic[traffic['event']=='preview'].shape[0]

16708

## Geographical Distribution

In [72]:
pageview_data.head()

Unnamed: 0,event,date,country,city,artist,album,track,isrc,linkid
84043,pageview,2021-08-19,Saudi Arabia,Riyadh,Tesher,Jalebi Baby,Jalebi Baby,QZNWQ2070741,2d896d31-97b6-4869-967b-1c5fb9cd4bb8
84044,pageview,2021-08-19,United States,Absecon,KA$HDAMI,epiphany,Reparations!,QZFYY2115255,9c61dba1-9369-4ee4-a215-1d34581cd811
84046,pageview,2021-08-19,Ireland,Mullingar,Cardi B,Up,Up,USAT22100061,9d847c0e-6824-438f-9dc1-d76fca6dc87c
84047,pageview,2021-08-19,United Kingdom,Northampton,"Kayla Nicole, Taylor Girlz, Flo Milli","Bundles 2 (feat. Flo Milli, Taylor Girlz)","Bundles 2 (feat. Flo Milli, Taylor Girlz)",QMCE32100359,fbf4b935-f961-4b13-b1d8-45ad47093559
84048,pageview,2021-08-19,Saudi Arabia,Yanbu,Wun Two,econto - Single,econto,USAT22103621,950cfa6f-ed68-4de6-bbba-ddf697f0eeed


In [74]:
pageview_countries= pageview_data['country'].unique()
print(pageview_countries)

['Saudi Arabia' 'United States' 'Ireland' 'United Kingdom' 'France'
 'Guatemala' 'Jordan' 'Kuwait' 'Pakistan' 'Italy' 'Germany' 'Iraq' 'Peru'
 'India' 'Nicaragua' 'Rwanda' 'Tanzania' 'United Arab Emirates' 'Norway'
 'Oman' 'Bahamas' 'Algeria' 'Czechia' 'Mexico' 'Jamaica' 'Netherlands'
 'Colombia' 'Morocco' 'Australia' 'Myanmar' 'Uzbekistan' 'Austria'
 'Latvia' 'Turkey' 'Mauritania' 'Sri Lanka' 'Bosnia and Herzegovina'
 'Estonia' 'Nigeria' 'Bulgaria' 'Greece' 'El Salvador' 'Philippines'
 'Denmark' 'Serbia' 'Canada' 'Spain' 'Libya' 'Palestine' 'Chad' 'Ecuador'
 'Mali' 'Romania' 'Switzerland' 'Portugal' 'Slovenia' 'Iceland' 'Sweden'
 'Bahrain' 'Egypt' 'Lithuania' 'Liberia' 'Israel' 'Ukraine' 'Puerto Rico'
 'South Africa' 'Ghana' 'Kenya' 'Armenia' 'Nepal' 'Barbados' 'Azerbaijan'
 'Qatar' 'Uganda' 'Poland' 'Brazil' 'Guyana' 'Fiji' 'Bangladesh' 'Belgium'
 'Hong Kong' 'Haiti' 'Botswana' 'Ivory Coast' 'Bhutan' 'Indonesia'
 'Réunion' 'Cameroon' 'Singapore' 'Yemen' 'French Guiana' 'Slovakia'
 'D

## Click-Through Rate Analysis

In [75]:
total_clicks= traffic[traffic['event']=='click'].shape[0]
total_pageviews= traffic[traffic['event']=='pageview'].shape[0]

In [76]:
ctr= (total_clicks / total_pageviews)*100
ctr

44.30070883315158

In [79]:
link_group= traffic.groupby('linkid')['event'].value_counts().unstack(fill_value=0)
link_group

event,click,pageview,preview
linkid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00073307-ae96-5089-a117-4783afb42f8e,0,2,0
00126b32-0c35-507b-981c-02c80d2aa8e7,2,2,0
0018cfff-50a1-5984-9715-01ef2d11a49a,0,1,0
0033934b-5d16-5a06-af58-d087bcdd3680,0,1,0
0034d6cf-3bd8-5ffe-aafc-b3959fc48608,0,1,0
...,...,...,...
fff38ca0-8043-50cd-a5f1-f65ebb7105c5,1,1,0
fff4e5f0-4ee5-5fe7-aa30-e870edaf6ed7,0,1,0
fff84c0e-90a1-59d8-9997-adc909d50e16,1,1,0
fffc17a7-f935-5d3e-bd3e-d761fd80d479,1,2,0


In [81]:
link_group['CTR'] = (link_group.get('click', 0) / link_group.get('pageview', 1)) * 100
link_group[['CTR']]

event,CTR
linkid,Unnamed: 1_level_1
00073307-ae96-5089-a117-4783afb42f8e,0.0
00126b32-0c35-507b-981c-02c80d2aa8e7,100.0
0018cfff-50a1-5984-9715-01ef2d11a49a,0.0
0033934b-5d16-5a06-af58-d087bcdd3680,0.0
0034d6cf-3bd8-5ffe-aafc-b3959fc48608,0.0
...,...
fff38ca0-8043-50cd-a5f1-f65ebb7105c5,100.0
fff4e5f0-4ee5-5fe7-aa30-e870edaf6ed7,0.0
fff84c0e-90a1-59d8-9997-adc909d50e16,100.0
fffc17a7-f935-5d3e-bd3e-d761fd80d479,50.0


## Correlation Analysis

In [82]:
link_group

event,click,pageview,preview,CTR
linkid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00073307-ae96-5089-a117-4783afb42f8e,0,2,0,0.0
00126b32-0c35-507b-981c-02c80d2aa8e7,2,2,0,100.0
0018cfff-50a1-5984-9715-01ef2d11a49a,0,1,0,0.0
0033934b-5d16-5a06-af58-d087bcdd3680,0,1,0,0.0
0034d6cf-3bd8-5ffe-aafc-b3959fc48608,0,1,0,0.0
...,...,...,...,...
fff38ca0-8043-50cd-a5f1-f65ebb7105c5,1,1,0,100.0
fff4e5f0-4ee5-5fe7-aa30-e870edaf6ed7,0,1,0,0.0
fff84c0e-90a1-59d8-9997-adc909d50e16,1,1,0,100.0
fffc17a7-f935-5d3e-bd3e-d761fd80d479,1,2,0,50.0


In [83]:
clicks= link_group.get('click',0)
previews= link_group.get('preview',0)

In [84]:
correlation= clicks.corr(previews)
correlation

0.9634390538767612

In [91]:
from scipy.stats import pearsonr, chi2_contingency

In [86]:
correlation,p_value= pearsonr(clicks,previews)

In [87]:
correlation

0.9634390538767621

In [88]:
p_value

0.0

In [89]:
if p_value < 0.05:
    print("The correlation is statistically significant.")
else:
    print("The correlation is not statistically significant.")

The correlation is statistically significant.


- Yes, the correlation is statistically significant and there exists strong correlation between clicks and previews on link

## Linear Relationship (Pearson Correlation)

In [93]:
# Group by 'linkid' and count the number of 'click' and 'preview' events for each link
link_group = traffic.groupby('linkid')['event'].value_counts().unstack(fill_value=0)

# Extract the counts of clicks and previews
clicks = link_group.get('click', 0)
previews = link_group.get('preview', 0)

# Step 1: Pearson Correlation for Linear Relationship
correlation, p_value = pearsonr(clicks, previews)
print(f"Linear Relationship Test (Pearson Correlation):")
print(f"Correlation coefficient: {correlation:.2f}")
print(f"P-value: {p_value:.4f}")
if p_value < 0.05:
    print("The correlation is statistically significant.")
else:
    print("The correlation is not statistically significant.")

Linear Relationship Test (Pearson Correlation):
Correlation coefficient: 0.96
P-value: 0.0000
The correlation is statistically significant.


## Categorical Relationship (Chi-square Test)

In [98]:
# Create a contingency table
contingency_table = pd.crosstab(clicks, previews)

# Perform the Chi-square test
chi2, chi_p_value, dof, expected = chi2_contingency(contingency_table)
print("Categorical Relationship Test (Chi-square Test):")
print(f"Chi-square statistic: {chi2:.2f}")
print(f"P-value: {chi_p_value:.4f}")
if chi_p_value < 0.05:
    print("There is a statistically significant association between clicks and previews as categories.")
else:
    print("There is no statistically significant association between clicks and previews as categories.")

Categorical Relationship Test (Chi-square Test):
Chi-square statistic: 279043.41
P-value: 0.0000
There is a statistically significant association between clicks and previews as categories.
