In [None]:
# Total and Average Daily Pageview Events

In [1]:
import pandas as pd

# Read the raw event log and parse the 'date' column into datetimes
data = pd.read_csv('traffic.csv')
data['date'] = pd.to_datetime(data['date'])

# Keep only the rows whose event type is 'pageview'
is_pageview = data['event'].eq('pageview')
pageviews = data.loc[is_pageview]

# Overall number of pageview rows
total_pageviews = pageviews.shape[0]

# Pageview rows per calendar day, then the mean over the days that appear
# in the data (days without any pageview row are not part of the average)
pageview_day = pageviews['date'].dt.date
daily_pageviews = pageviews.groupby(pageview_day).size()
average_daily_pageviews = daily_pageviews.mean()

print("Total Pageviews:", total_pageviews)
print("Average Daily Pageviews:", average_daily_pageviews)


Total Pageviews: 142015
Average Daily Pageviews: 20287.85714285714


In [None]:
# Distribution of Event Types

In [2]:
# Tally how many rows exist for each distinct value of the 'event' column
# (value_counts orders the result from most to least frequent)
event_column = data['event']
event_distribution = event_column.value_counts()

# Header and table are emitted on separate lines by a single print call
print("Event Distribution:", event_distribution, sep="\n")


Event Distribution:
event
pageview    142015
click        55732
preview      28531
Name: count, dtype: int64


In [None]:
# Geographical Distribution

In [3]:
# Count pageview events per country.
# The pageview rows are selected from `data` here instead of reusing the
# `pageviews` name: the CTR cell further down rebinds `pageviews` to a
# per-link Series of counts, so relying on that name makes this cell fail
# with a KeyError on 'country' whenever it is re-run after the CTR cell.
pageview_countries = data.loc[data['event'] == 'pageview', 'country'].value_counts()

print("Countries Contributing to Pageviews:")
print(pageview_countries)


Countries Contributing to Pageviews:
country
Saudi Arabia         28873
India                27286
United States        20839
France                9674
Iraq                  4897
                     ...  
Wallis and Futuna        1
Solomon Islands          1
Guinea-Bissau            1
Lesotho                  1
Saint Martin             1
Name: count, Length: 211, dtype: int64


In [None]:
# Click-Through Rate Analysis

In [4]:
# Number of click rows and pageview rows for every linkid.
# NOTE: this rebinds `pageviews` from the filtered DataFrame built in the
# first cell to a Series indexed by linkid; the correlation cell below
# relies on this Series form.
clicks = data.loc[data['event'].eq('click')].groupby('linkid').size()
pageviews = data.loc[data['event'].eq('pageview')].groupby('linkid').size()

# Overall click-through rate: all clicks divided by all pageviews
total_clicks = clicks.sum()
total_link_pageviews = pageviews.sum()
overall_ctr = total_clicks / total_link_pageviews

# Per-link click-through rate. The division aligns on linkid, so a link that
# appears in only one of the two series yields NaN and is dropped.
ctr_per_link = clicks.div(pageviews).dropna()

print("Overall CTR:", overall_ctr)
print("CTR per Link:")
print(ctr_per_link)


Overall CTR: 0.3924374185825441
CTR per Link:
linkid
00126b32-0c35-507b-981c-02c80d2aa8e7    1.000000
004b9724-abca-5481-b6e9-6148a7ca00a5    1.000000
0063a982-41cd-5629-96d0-e1c4dd72ea11    0.666667
006af6a0-1f0d-4b0c-93bf-756af9071c06    0.222222
00759b81-3f04-4a61-b934-f8fb3185f4a0    0.750000
                                          ...   
ffd8d5a7-91bc-48e1-a692-c26fca8a8ead    0.345238
fff38ca0-8043-50cd-a5f1-f65ebb7105c5    1.000000
fff84c0e-90a1-59d8-9997-adc909d50e16    1.000000
fffc17a7-f935-5d3e-bd3e-d761fd80d479    0.500000
fffd0045-29de-522b-b5d8-35786363bf07    0.500000
Length: 2253, dtype: float64


In [None]:
# Correlation Analysis

In [6]:
from scipy.stats import pearsonr, spearmanr

# Turn the per-link count Series into two-column frames keyed by 'linkid'
clicks_df = clicks.rename_axis('linkid').reset_index(name='clicks')
pageviews_df = pageviews.rename_axis('linkid').reset_index(name='pageviews')

# Inner join keeps only the links that have both a click count and a
# pageview count
link_data = clicks_df.merge(pageviews_df, on='linkid', how='inner')

link_clicks = link_data['clicks']
link_pageviews = link_data['pageviews']

# Linear (Pearson) and rank-based (Spearman) correlation between the
# per-link click and pageview counts, each with its p-value
pearson_corr, pearson_p = pearsonr(link_clicks, link_pageviews)
spearman_corr, spearman_p = spearmanr(link_clicks, link_pageviews)

print(f"Pearson Correlation: {pearson_corr}, p-value: {pearson_p}")
print(f"Spearman Correlation: {spearman_corr}, p-value: {spearman_p}")


Pearson Correlation: 0.9939838266311063, p-value: 0.0
Spearman Correlation: 0.8181542470924686, p-value: 0.0
