In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the prepared dataset and discard the two 'Unnamed' columns
# (presumably index columns left over from earlier CSV exports — confirm).
df = pd.read_csv('final_data.csv').drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
df.head()

Unnamed: 0,log_price,accommodates,bathrooms,bedrooms,beds,Family/kid friendly,translation missing: en.hosting_amenity_50,Fire extinguisher,Cable TV,Buzzer/wireless intercom,...,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60,cleaning_fee_True,city_Chicago,city_DC,city_LA,city_NYC,city_SF
0,5.010635,3,1.0,1.0,1.0,1,1,0,0,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,5.129899,7,1.0,3.0,3.0,1,1,1,0,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,4.976734,5,1.0,1.0,3.0,1,1,1,1,1,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,6.620073,4,1.0,2.0,2.0,0,0,1,1,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,4.744932,2,1.0,0.0,1.0,0,0,1,0,0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [3]:
# Threshold for binarising the target: the mean of the log_price column.
# Computed here, before the later dropna cell removes incomplete rows.
log_price_mean = df.loc[:, 'log_price'].mean()

In [4]:
# Turn log_price into a binary label: 1 when above the mean, otherwise 0.
# NOTE: this overwrites log_price in place, so re-running the cell compares the
# 0/1 labels against the stored mean instead of the original prices.
df['log_price'] = (df['log_price'] > log_price_mean).astype(int)

In [5]:
# Display the frame after log_price has been replaced by the 0/1 label.
df

Unnamed: 0,log_price,accommodates,bathrooms,bedrooms,beds,Family/kid friendly,translation missing: en.hosting_amenity_50,Fire extinguisher,Cable TV,Buzzer/wireless intercom,...,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60,cleaning_fee_True,city_Chicago,city_DC,city_LA,city_NYC,city_SF
0,1,3,1.0,1.0,1.0,1,1,0,0,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1,7,1.0,3.0,3.0,1,1,1,0,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,1,5,1.0,1.0,3.0,1,1,1,1,1,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1,4,1.0,2.0,2.0,0,0,1,1,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0,2,1.0,0.0,1.0,0,0,1,0,0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74106,0,1,1.0,1.0,1.0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
74107,1,4,2.0,2.0,4.0,1,1,0,1,0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
74108,1,5,1.0,2.0,2.0,1,0,0,0,1,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
74109,1,2,1.0,0.0,2.0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [6]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [7]:
# Per-column count of remaining missing values.
df.isnull().sum()

log_price       0
accommodates    0
bathrooms       0
bedrooms        0
beds            0
               ..
city_Chicago    0
city_DC         0
city_LA         0
city_NYC        0
city_SF         0
Length: 70, dtype: int64

In [8]:
# Hold out 30% of the rows for testing; log_price is the label, every other column a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='log_price'),
    df['log_price'],
    test_size=0.3,
    random_state=42,
)

In [9]:
# Build a Gaussian Naive Bayes model and fit it on the training split in one step
classifier = GaussianNB().fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = classifier.predict(X_test)

In [10]:
# Report how many test rows were predicted with the wrong label.
mislabeled_report = f"Naive Bayes: Number of mislabeled points out of a total {X_test.shape[0]} points: {(y_test != y_pred).sum()}"
print(mislabeled_report)

Naive Bayes: Number of mislabeled points out of a total 22130 points: 6828


Feature selection: keep the 10 best features by chi-squared score

In [11]:
# Chi-squared feature selection: score on the training split only,
# then apply the same column subset to both splits.
k = 10  # number of top-scoring features to keep
selector = SelectKBest(score_func=chi2, k=k).fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

In [12]:
# Refit Gaussian Naive Bayes on the reduced training matrix
classifier = GaussianNB().fit(X_train_selected, y_train)

# Predict labels for the reduced test matrix
y_pred = classifier.predict(X_test_selected)

In [13]:
# Report how many test rows were mislabeled when using only the selected features.
mislabeled_report = f"Naive Bayes: Number of mislabeled points out of a total {X_test_selected.shape[0]} points: {(y_test != y_pred).sum()}"
print(mislabeled_report)

Naive Bayes: Number of mislabeled points out of a total 22130 points: 5228


Feature selection with test size 0.2 and k = 2

In [14]:
# Re-split with a 20% test share; same label column and random seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='log_price'),
    df['log_price'],
    test_size=0.2,
    random_state=42,
)

In [15]:
# Chi-squared feature selection, this time keeping only the 2 top-scoring features
k = 2
selector = SelectKBest(score_func=chi2, k=k).fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Fit Gaussian Naive Bayes on the reduced training matrix
classifier = GaussianNB().fit(X_train_selected, y_train)

# Predict labels for the reduced test matrix
y_pred = classifier.predict(X_test_selected)

print(f"Naive Bayes: Number of mislabeled points out of a total {X_test_selected.shape[0]} points: {(y_test != y_pred).sum()}")

Naive Bayes: Number of mislabeled points out of a total 14754 points: 3492


In [16]:
# List the names of the columns kept by the most recently fitted selector.
# `features` and `selected_feature_indices` were not defined in any earlier
# cell, so they are derived here from the selector and the training frame.
selected_feature_indices = selector.get_support(indices=True)  # integer positions of kept columns
features = X_train.columns  # column labels in the order the selector saw them
selected_features = [features[i] for i in selected_feature_indices]
print("Selected Features:", selected_features)