In [33]:
# Imports
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import sklearn


In [34]:
# Import data
df = pd.read_csv('data/train.csv')
# Reduce dataframe the number of lines by half
df = df.sample(frac=0.1)

In [35]:
# Analyze data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96733 entries, 232459 to 189296
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   auctionId                      96733 non-null  object
 1   timeStamp                      96733 non-null  int64 
 2   placementId                    96733 non-null  int64 
 3   websiteId                      96733 non-null  int64 
 4   hashedRefererDeepThree         95127 non-null  object
 5   country                        96692 non-null  object
 6   opeartingSystem                96733 non-null  object
 7   browser                        96733 non-null  object
 8   browserVersion                 83692 non-null  object
 9   device                         96733 non-null  object
 10  environmentType                96733 non-null  object
 11  integrationType                96733 non-null  int64 
 12  articleSafenessCategorization  96733 non-null  object


In [36]:
df.head()

Unnamed: 0,auctionId,timeStamp,placementId,websiteId,hashedRefererDeepThree,country,opeartingSystem,browser,browserVersion,device,environmentType,integrationType,articleSafenessCategorization,isSold
232459,be26fcec-7cc4-4b9f-885c-d94b508a95ad_03e220e4-...,1603735983,123345,69763,7c94e8c8fb968cdd1ff4bddac790556d656af120d30e79...,US,Android,Facebook App,293_0,Phone,js-web,2,safe,True
683110,0a5a080a-c645-42b7-b1d8-1dd47b2de97c_25295693-...,1604070593,130592,73311,2cb327b11ed49af0e73551c25a7144c5019a4b8b4084fe...,US,Windows,Microsoft Edge,86_0,PC,js-web,2,safe,False
691207,79303d5b-8c44-4f41-81be-105ad2ff16ca_db44b39d-...,1604244729,129333,72690,3d2ea7c5f733ff8e6e87414a16740b2f79873352018708...,US,iOS,Facebook App,,Phone,js-web,2,safe,False
541475,cf9acf80-fe6e-4089-98e9-ca24493eb6c1_80e0baa8-...,1604013034,128228,72243,c2cd34619ad2db3032003ca151da887e098a7b0de174de...,US,Windows,Microsoft Edge,86_0,PC,js-web,2,uncat,False
127020,8d4559ed-e028-45d4-83f0-e2fa0063d869_83496ab0-...,1603739347,121842,68951,e01175c1daea2889a6c8801cf8013bdf143f56bb99c2a2...,US,macOS,Microsoft Edge,80_0,PC,js-web,2,safe,False


In [37]:
df.describe()

Unnamed: 0,timeStamp,placementId,websiteId,integrationType
count,96733.0,96733.0,96733.0,96733.0
mean,1603976000.0,114013.889407,57592.643989,1.80563
std,176154.8,17712.274271,14251.176575,0.395717
min,1603670000.0,18341.0,13734.0,1.0
25%,1603823000.0,108383.0,48916.0,2.0
50%,1603976000.0,120706.0,60485.0,2.0
75%,1604132000.0,124501.0,68951.0,2.0
max,1604275000.0,133258.0,74126.0,2.0


In [38]:
# get every unique value in the column 'opeartingSystem'
df.opeartingSystem.unique()


array(['Android', 'Windows', 'iOS', 'macOS', 'OS X', 'Fire OS',
       'Chrome OS', 'BSD', 'Linux', 'unknown', 'Tizen', 'KaiOS'],
      dtype=object)

In [39]:
# Histogram of the column 'opeartingSystem'
fig = px.histogram(df, x='opeartingSystem').update_xaxes(categoryorder='total descending')
fig.show()

In [40]:
# get every unique value in the column 'browser'
df.browser.unique()
# get the count of unique values in the column 'browser'
df.browser.nunique()

78

In [41]:
# Histogram of the column 'browser'
fig = px.histogram(df, x='browser').update_xaxes(categoryorder="total descending")
fig.show()

In [42]:
# Histogram of the column 'country'
fig = px.histogram(df, x='country').update_xaxes(categoryorder="total descending")
fig.show()

In [43]:
# Convert country code 'UK' to 'GB'
df['country'] = df['country'].replace('UK', 'GB')

In [44]:
# Histogram of the column 'timestamp'
fig = px.histogram(df, x='timeStamp')
fig.show()

In [45]:
# Function that converts the timestamp to a datetime object
def convert_timestamp(timestamp):
    date = pd.to_datetime(timestamp, unit='s')
    # extract the hour from the timestamp
    hour = date.hour
    return hour

In [46]:
# Convert the timestamp to a datetime object
df['time'] = df['timeStamp'].apply(convert_timestamp)

In [47]:
# Histogram of the column 'time'
df_fr = df[df['country'] == 'US']
fig = px.histogram(df_fr, x='time', color='isSold').update_xaxes(categoryorder="total descending")
fig.show()

In [48]:
# For each hour, count the number of sold ads
hour_count = df_fr.groupby('time')['isSold'].mean()
    
# Plot the number
fig = px.bar(hour_count, x=hour_count.index, y=hour_count.values).update_xaxes(categoryorder="total descending")
fig.show()

In [49]:
# For each country, create a individual datatframe with the mean of the sold ads for each hour
# Get all individual country
country_list = df.country.unique()

# Concatenate time and country columns to create a new dataframe
df['time_country'] = df['country'].astype(str) + '_' + df['time'].astype(str)

# Show the first 5 rows of the dataframe
df.head()


Unnamed: 0,auctionId,timeStamp,placementId,websiteId,hashedRefererDeepThree,country,opeartingSystem,browser,browserVersion,device,environmentType,integrationType,articleSafenessCategorization,isSold,time,time_country
232459,be26fcec-7cc4-4b9f-885c-d94b508a95ad_03e220e4-...,1603735983,123345,69763,7c94e8c8fb968cdd1ff4bddac790556d656af120d30e79...,US,Android,Facebook App,293_0,Phone,js-web,2,safe,True,18,US_18
683110,0a5a080a-c645-42b7-b1d8-1dd47b2de97c_25295693-...,1604070593,130592,73311,2cb327b11ed49af0e73551c25a7144c5019a4b8b4084fe...,US,Windows,Microsoft Edge,86_0,PC,js-web,2,safe,False,15,US_15
691207,79303d5b-8c44-4f41-81be-105ad2ff16ca_db44b39d-...,1604244729,129333,72690,3d2ea7c5f733ff8e6e87414a16740b2f79873352018708...,US,iOS,Facebook App,,Phone,js-web,2,safe,False,15,US_15
541475,cf9acf80-fe6e-4089-98e9-ca24493eb6c1_80e0baa8-...,1604013034,128228,72243,c2cd34619ad2db3032003ca151da887e098a7b0de174de...,US,Windows,Microsoft Edge,86_0,PC,js-web,2,uncat,False,23,US_23
127020,8d4559ed-e028-45d4-83f0-e2fa0063d869_83496ab0-...,1603739347,121842,68951,e01175c1daea2889a6c8801cf8013bdf143f56bb99c2a2...,US,macOS,Microsoft Edge,80_0,PC,js-web,2,safe,False,19,US_19


In [50]:
# get timezone for each country
from datetime import datetime, tzinfo
from dateutil import tz
import pytz
# Create a function that returns the local time for a gevin country
def get_local_time(country, time):
    dt_str = pd.to_datetime(time, unit='s').strftime('%m/%d/%Y %H:%M:%S')
    format = "%m/%d/%Y %H:%M:%S"
    # Create datetime object in local timezone
    dt = datetime.strptime(dt_str, format)
    # Create datetime object in UTC timezone
    dt_utc = dt.replace(tzinfo=pytz.UTC)
    # Create datetime object in country timezone
    try:
        country_tz = pytz.country_timezones(country)[0]
        dt_country = dt_utc.astimezone(pytz.timezone(country_tz))
        return dt_country.strftime('%H')
    except:
        return pd.NaT


In [51]:
# Create a new column with the local time for each country
df['local_time'] = df.apply(lambda row: get_local_time(row['country'], row['timeStamp']), axis=1)


In [52]:
# Count the number of NaN values in the column 'local_time'
df['local_time'].isna().sum()

41

In [53]:
# For each hour, count the number of sold ads
hour_count = df.groupby('local_time')['isSold'].mean()

fig = px.bar(hour_count, x=hour_count.index, y=hour_count.values).update_xaxes(categoryorder="total descending")
fig.show()

In [65]:
# Function that returns 'neutral' if the value is between 02 and 08
def neutral(hour):
    if hour == '02':
        return 'neutral'
    elif hour == '03':
        return 'neutral'
    elif hour == '04':
        return 'neutral'
    elif hour == '05':
        return 'neutral'
    elif hour == '06':
        return 'neutral'
    elif hour == '07':
        return 'neutral'    
    elif hour == '08':
        return 'neutral'
    else:
        return hour

        # Apply the function to the column 'local_time'
df['local_time'] = df['local_time'].apply(neutral)

        # Function that returns 'bad' if the value is between 18 and 23


def bad(hour):
    if hour == '18':
        return 'bad'
    elif hour == '19':
        return 'neutral'
    elif hour == '20':
        return 'neutral'
    elif hour == '21':
        return 'neutral'
    elif hour == '22':
        return 'neutral'
    elif hour == '23':
        return 'bad'
    else:
        return hour

# Apply the function to the column 'local_time'
df['local_time'] = df['local_time'].apply(bad)


In [66]:
df['local_time'].head(10)

232459     14
683110     11
691207     10
541475     19
127020     15
791309     10
674945     13
99865     bad
282825     12
559283     03
Name: local_time, dtype: object

In [67]:
# show every unique value of local_time
df.local_time.unique()

array(['14', '11', '10', '19', '15', '13', 'bad', '12', '03', '17',
       'neutral', '09', '22', '16', '21', '05', '00', '07', '01', '20',
       '06', '04', NaT], dtype=object)

In [68]:
# get_dumy for the column 'local_time'
df_dumy = pd.get_dummies(df['local_time']).rename(columns=lambda x: 'local_time_' + str(x))

In [69]:
# Show the first 5 rows of the dataframe
df_dumy

Unnamed: 0,local_time_00,local_time_01,local_time_03,local_time_04,local_time_05,local_time_06,local_time_07,local_time_09,local_time_10,local_time_11,...,local_time_14,local_time_15,local_time_16,local_time_17,local_time_19,local_time_20,local_time_21,local_time_22,local_time_bad,local_time_neutral
232459,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
683110,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
691207,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
541475,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
127020,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510251,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
695120,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
238112,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
310896,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
# Make a prediction model based on the data
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df_dumy, df['isSold'], test_size=0.2, random_state=42)
    

In [71]:
# Fit the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [72]:
# compute the score
model.score(X_test, y_test)

0.5535741975500078