In [1]:
import numpy as np
import pandas as pd

In [7]:
# Synthetic concert dataset: every column is drawn at random, reproducibly.
SEED = 42
N_SAMPLES = 100

# Levels each categorical / discrete column is sampled from
continents = ['North America', 'South America', 'Asia', 'Europe']
days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
VENUE_CAPACITIES = [8000, 11000, 12300, 15000, 14900, 22800, 33200]
BINARY_FLAG = [0, 1]

# Seed the legacy global NumPy RNG. All draws below consume that one stream,
# so their order determines the generated values -- keep it fixed.
np.random.seed(SEED)
city_population = np.random.randint(2_000_000, 30_000_000, size=N_SAMPLES)  # upper bound is exclusive
continent = np.random.choice(continents, size=N_SAMPLES)
venue_capacity = np.random.choice(VENUE_CAPACITIES, size=N_SAMPLES)
day_of_week = np.random.choice(days_of_week, size=N_SAMPLES)
multiple_concerts = np.random.choice(BINARY_FLAG, size=N_SAMPLES)
sold_out = np.random.choice(BINARY_FLAG, size=N_SAMPLES)  # target; drawn independently of the features

# Assemble the columns into a single frame (one row per concert)
column_data = {
    'City Population': city_population,
    'Continent': continent,
    'Venue Capacity': venue_capacity,
    'Day of Week': day_of_week,
    'Multiple Concerts': multiple_concerts,
    'Sold Out': sold_out,
}
df = pd.DataFrame(column_data)

In [8]:
# Preview the first five rows of the synthetic dataset.
df.head()

Unnamed: 0,City Population,Continent,Venue Capacity,Day of Week,Multiple Concerts,Sold Out
0,25200604,Europe,12300,Saturday,1,1
1,18094478,North America,12300,Friday,0,0
2,25327850,South America,22800,Wednesday,1,0
3,28858567,Asia,15000,Thursday,0,0
4,23081788,North America,11000,Thursday,1,1


In [9]:
# One-hot encode the two categorical columns: one indicator column per level.
categorical_cols = ['Continent', 'Day of Week']
df2 = pd.get_dummies(df[categorical_cols])

In [10]:
# Display the indicator columns produced from 'Continent' and 'Day of Week'.
df2

Unnamed: 0,Continent_Asia,Continent_Europe,Continent_North America,Continent_South America,Day of Week_Friday,Day of Week_Monday,Day of Week_Saturday,Day of Week_Sunday,Day of Week_Thursday,Day of Week_Tuesday,Day of Week_Wednesday
0,False,True,False,False,False,False,True,False,False,False,False
1,False,False,True,False,True,False,False,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,True
3,True,False,False,False,False,False,False,False,True,False,False
4,False,False,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
95,False,False,True,False,True,False,False,False,False,False,False
96,False,True,False,False,False,False,False,False,False,False,True
97,False,False,True,False,False,False,True,False,False,False,False
98,False,True,False,False,False,False,False,False,False,False,True


In [11]:
# Place the indicator columns to the right of the original columns (rows align on the index).
df3 = pd.concat((df, df2), axis='columns')

In [12]:
# Display the original columns side by side with their one-hot indicators.
df3

Unnamed: 0,City Population,Continent,Venue Capacity,Day of Week,Multiple Concerts,Sold Out,Continent_Asia,Continent_Europe,Continent_North America,Continent_South America,Day of Week_Friday,Day of Week_Monday,Day of Week_Saturday,Day of Week_Sunday,Day of Week_Thursday,Day of Week_Tuesday,Day of Week_Wednesday
0,25200604,Europe,12300,Saturday,1,1,False,True,False,False,False,False,True,False,False,False,False
1,18094478,North America,12300,Friday,0,0,False,False,True,False,True,False,False,False,False,False,False
2,25327850,South America,22800,Wednesday,1,0,False,False,False,True,False,False,False,False,False,False,True
3,28858567,Asia,15000,Thursday,0,0,True,False,False,False,False,False,False,False,True,False,False
4,23081788,North America,11000,Thursday,1,1,False,False,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,24373627,North America,33200,Friday,1,0,False,False,True,False,True,False,False,False,False,False,False
96,28477560,Europe,8000,Wednesday,1,0,False,True,False,False,False,False,False,False,False,False,True
97,3007293,North America,8000,Saturday,1,0,False,False,True,False,False,False,True,False,False,False,False
98,23551399,Europe,15000,Wednesday,1,1,False,True,False,False,False,False,False,False,False,False,True


In [13]:
# The raw text columns are now redundant with their indicator columns; drop them.
df4 = df3.drop(columns=['Continent', 'Day of Week'])

In [14]:
# Display the frame after the text columns have been replaced by indicators.
df4

Unnamed: 0,City Population,Venue Capacity,Multiple Concerts,Sold Out,Continent_Asia,Continent_Europe,Continent_North America,Continent_South America,Day of Week_Friday,Day of Week_Monday,Day of Week_Saturday,Day of Week_Sunday,Day of Week_Thursday,Day of Week_Tuesday,Day of Week_Wednesday
0,25200604,12300,1,1,False,True,False,False,False,False,True,False,False,False,False
1,18094478,12300,0,0,False,False,True,False,True,False,False,False,False,False,False
2,25327850,22800,1,0,False,False,False,True,False,False,False,False,False,False,True
3,28858567,15000,0,0,True,False,False,False,False,False,False,False,True,False,False
4,23081788,11000,1,1,False,False,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,24373627,33200,1,0,False,False,True,False,True,False,False,False,False,False,False
96,28477560,8000,1,0,False,True,False,False,False,False,False,False,False,False,True
97,3007293,8000,1,0,False,False,True,False,False,False,True,False,False,False,False
98,23551399,15000,1,1,False,True,False,False,False,False,False,False,False,False,True


In [15]:
# Feature matrix: every column except the 'Sold Out' target.
X = df4.drop(columns=['Sold Out'])

In [16]:
# Target vector: the binary 'Sold Out' flag.
y = df4.loc[:, 'Sold Out']

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

In [19]:
from sklearn.naive_bayes import GaussianNB 

In [20]:
# Gaussian Naive Bayes classifier with scikit-learn's default settings.
gnb = GaussianNB()

In [21]:
# Fit the classifier on the training split only.
gnb.fit(X_train, y_train)

In [22]:
# Predict 'Sold Out' labels for the held-out test split.
y_pred = gnb.predict(X_test)

In [23]:
from sklearn.metrics import classification_report

In [24]:
# Per-class precision / recall / F1 on the test split.
# NOTE: 'Sold Out' is generated with np.random.choice independently of every
# feature, so scores close to chance are the expected outcome here.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.33      0.22      0.27         9
           1       0.50      0.64      0.56        11

    accuracy                           0.45        20
   macro avg       0.42      0.43      0.41        20
weighted avg       0.42      0.45      0.43        20



In [25]:
# Mean accuracy on the training split.
gnb.score(X_train, y_train)

0.5

In [27]:
# Mean accuracy on the held-out test split.
gnb.score(X_test, y_test)

0.45

In [29]:
# Search grid for GaussianNB's `var_smoothing` (scikit-learn default: 1e-9).
# Every candidate must be distinct: a repeated value makes GridSearchCV fit
# the same setting twice without exploring anything new. The values are
# log-spaced around the default.
# NOTE: var_smoothing is a fraction of the LARGEST feature variance, so with
# the unscaled 'City Population' column even small values add a large
# absolute variance to the 0/1 indicator columns.
param_grid = {
    'var_smoothing': [1e-10, 1e-9, 1e-8, 1e-7]
}

In [30]:
from sklearn.model_selection import GridSearchCV

In [33]:
# 5-fold cross-validated search over param_grid, scored by accuracy, using all CPU cores.
grid_search = GridSearchCV(
    estimator=gnb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [35]:
# Run the cross-validated search on the training split only; the test split stays unseen.
grid_search.fit(X_train, y_train)

In [36]:
# Parameter setting with the best mean cross-validated score.
grid_search.best_params_

{'var_smoothing': 1e-08}

In [37]:
# Mean cross-validated accuracy achieved by the best parameter setting.
grid_search.best_score_

np.float64(0.375)