## Setup

In [1]:
import pandas as pd
import json
import pickle
pd.set_option('display.max_columns', 50)

In [2]:
# Read ./data/data_train.csv into a DataFrame and preview its first rows.
dataset = pd.read_csv("./data/data_train.csv")
dataset.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,804,1,0.8,1,12,1,41,0.9,89,1,13,709,818,2027,11,5,11,1,0,0,1
1,1042,0,2.2,0,15,1,11,0.6,139,5,16,68,1018,2826,18,0,2,1,0,0,2
2,1481,1,2.0,1,0,0,35,0.5,105,3,0,249,522,2635,17,16,4,1,0,1,2
3,1104,0,1.7,0,1,1,60,0.4,199,2,13,653,1413,1229,6,0,3,1,1,1,0
4,652,0,0.5,1,1,0,58,0.6,142,3,2,464,781,565,18,12,9,0,0,1,0


## Naive Bayes

In [3]:
# Separate the feature matrix (x) from the target vector (y).
x = dataset.drop(columns=['price_range'])
y = dataset['price_range']

# A column needs binning when it holds anything other than 0/1 values.
# The target is not a feature, so it is taken off the list afterwards.
cols_to_bin = [
    col
    for col, only_zero_one in dataset.isin([0, 1]).all().items()
    if not only_zero_one
]
cols_to_bin.remove('price_range')

# Equal-width binning with pd.cut; the edges of every cut are recorded so the
# same discretisation can be replayed on other data later.
def create_bins(dataset, num_bins, cols_to_bin):
    """Discretise the selected columns into ``num_bins`` equal-width bins.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source frame; it is copied, never modified.
    num_bins : int
        Number of bins passed to ``pd.cut`` for every column.
    cols_to_bin : iterable of str
        Names of the columns to discretise.

    Returns
    -------
    tuple
        ``(binned_frame, binning_info)`` where ``binned_frame`` is the copy
        with the selected columns replaced by integer bin codes and
        ``binning_info`` maps each column to ``{'bin_edges': [...]}``.
    """
    binned = dataset.copy()
    binning_info = {}

    for column in cols_to_bin:
        # retbins=True makes pd.cut hand back the computed edges next to the codes.
        codes, edges = pd.cut(binned[column], bins=num_bins, retbins=True, labels=False)
        binned[column] = codes
        binning_info[column] = {'bin_edges': list(edges)}

    return binned, binning_info

# Replay the training-time binning on new data.
def convert_to_bin(data, binning_info):
    """Map raw feature values onto the bins recorded in ``binning_info``.

    The outermost recorded edges are replaced by -inf / +inf before cutting.
    Without that, any value below the first or above the last training edge
    (and a value exactly equal to the first edge) is turned into NaN by
    ``pd.cut``; the column then becomes float and the probability lookup at
    prediction time fails. With open-ended outer bins such values fall into
    the first / last bin instead. Values inside the recorded range get the
    same bin code as before.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``binning_info``; it is
        copied, never modified.
    binning_info : dict
        ``{column: {'bin_edges': [...]}}`` as produced at training time.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the listed columns replaced by bin codes.
        NaN inputs remain NaN.

    Raises
    ------
    KeyError
        If a column named in ``binning_info`` is missing from ``data``.
    """
    copied_data = data.copy()
    for column, info in binning_info.items():
        # Work on a copy so the stored edges are left untouched.
        bin_edges = list(info['bin_edges'])
        bin_edges[0], bin_edges[-1] = float('-inf'), float('inf')
        copied_data[column] = pd.cut(copied_data[column], bins=bin_edges, labels=False)

    return copied_data

# Discretise every column listed in cols_to_bin into 4 bins; binning_info keeps
# the resulting edges so the same cut can be applied to other data later.
binned_data, binning_info = create_bins(x, 4, cols_to_bin)

In [4]:
# Pairwise correlation between the binned features.
correlation_matrix = binned_data.corr()

# Pairs whose absolute correlation reaches this value count as "strong".
strong_correlation_threshold = 0.5  # adjust as needed

# Walk the upper triangle only, so every pair is inspected exactly once.
strong_correlations = []
for i, col1 in enumerate(correlation_matrix.columns):
    for j, col2 in enumerate(correlation_matrix.columns[i + 1:], start=i + 1):
        correlation = correlation_matrix.iloc[i, j]
        if abs(correlation) >= strong_correlation_threshold:
            strong_correlations.append((col1, col2, correlation))

# Report the strongly correlated pairs.
print("Strong Correlations:")
for col1, col2, correlation in strong_correlations:
    print(f"{col1} - {col2}: {correlation}")

Strong Correlations:
fc - pc: 0.5778665847796204
four_g - three_g: 0.5955752002886632


In [5]:
# Columns to remove: one member of each strongly correlated pair reported
# above (fc/pc and four_g/three_g).
cols_to_drop = ['fc', 'four_g']

# binned_data is overwritten in place, so without errors='ignore' running this
# cell a second time would raise KeyError because the columns are already gone.
binned_data = binned_data.drop(columns=cols_to_drop, errors='ignore')

In [6]:
# Preview the binned features that remain after dropping cols_to_drop.
binned_data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,0,1,0,1,2,3,0,0,2,1,0,1,1,1,1,1,0,0
1,1,0,2,0,0,2,1,2,3,0,1,2,3,0,0,1,0,0
2,2,1,2,1,2,1,0,1,0,0,0,2,3,3,0,1,0,1
3,1,0,1,0,3,1,3,0,2,1,2,1,0,0,0,1,1,1
4,0,0,0,1,3,2,2,1,0,0,0,0,3,2,1,0,0,1


In [7]:
def calculate_probabilities(feature_dataset, target_dataset, alpha=1):
    """Estimate smoothed probability tables for a categorical Naive Bayes model.

    Parameters
    ----------
    feature_dataset : pandas.DataFrame
        Discrete (already binned) features, one column per feature.
    target_dataset : pandas.Series
        Class label of every row; expected to share ``feature_dataset``'s index.
    alpha : number, default 1
        Additive smoothing constant added to every count.

    Returns
    -------
    dict
        ``{'prior_probability': {cls: P(cls)},
           feature: {cls: {value: P(value | cls)}}}``.
        Classes and feature values are keyed by their ``str()`` form, in the
        order in which ``unique()`` reports them.
    """
    target_classes = target_dataset.unique()
    n_samples = len(target_dataset)

    # Build the row mask and row count of every class once. They do not depend
    # on the feature being processed, so recomputing them in the inner loops
    # is pure overhead.
    class_stats = []
    for target_class in target_classes:
        class_mask = target_dataset == target_class
        class_stats.append((str(target_class), class_mask, int(class_mask.sum())))

    probabilities = {
        'prior_probability': {
            label: (class_size + alpha) / (n_samples + alpha * len(target_classes))
            for label, _, class_size in class_stats
        }
    }

    for feature_column in feature_dataset.columns:
        feature_values = feature_dataset[feature_column]
        feature_classes = feature_values.unique()
        n_values = len(feature_classes)

        # One mask per observed feature value, shared by all target classes.
        value_masks = [(str(value), feature_values == value) for value in feature_classes]

        probabilities[feature_column] = {
            label: {
                value_label: (int((value_mask & class_mask).sum()) + alpha)
                / (class_size + alpha * n_values)
                for value_label, value_mask in value_masks
            }
            for label, class_mask, class_size in class_stats
        }

    return probabilities


In [8]:
# Fit the probability tables on the binned training features and their labels
# (alpha is left at its default of 1).
probability = calculate_probabilities(binned_data, y)

In [9]:
# Display the fitted tables: class priors first, then one entry per feature.
probability

{'prior_probability': {'1': 0.25427350427350426,
  '2': 0.24643874643874644,
  '0': 0.2556980056980057,
  '3': 0.24358974358974358},
 'battery_power': {'1': {'0': 0.24166666666666667,
   '1': 0.275,
   '2': 0.25,
   '3': 0.23333333333333334},
  '2': {'0': 0.2808022922636103,
   '1': 0.24355300859598855,
   '2': 0.23782234957020057,
   '3': 0.23782234957020057},
  '0': {'0': 0.3259668508287293,
   '1': 0.3314917127071823,
   '2': 0.19613259668508287,
   '3': 0.1464088397790055},
  '3': {'0': 0.17391304347826086,
   '1': 0.19710144927536233,
   '2': 0.30144927536231886,
   '3': 0.32753623188405795}},
 'blue': {'1': {'1': 0.4720670391061452, '0': 0.5279329608938548},
  '2': {'1': 0.5014409221902018, '0': 0.49855907780979825},
  '0': {'1': 0.475, '0': 0.525},
  '3': {'1': 0.5276967930029155, '0': 0.47230320699708456}},
 'clock_speed': {'1': {'0': 0.39444444444444443,
   '2': 0.2,
   '1': 0.20555555555555555,
   '3': 0.2},
  '2': {'0': 0.37822349570200575,
   '2': 0.20630372492836677,
   '1

In [10]:
def _feature_key(value):
    """Return the lookup key for a feature value, writing integral floats as ints."""
    # DataFrame.iterrows() upcasts a row with mixed int/float columns to float,
    # so a bin code 1 arrives as 1.0; str() would give '1.0', which is not a
    # key of the probability tables.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def predict(dataset, probabilities):
    """Predict the most probable class for every row of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Binned features; every column must have an entry in ``probabilities``.
    probabilities : dict
        ``{'prior_probability': {cls: p}, feature: {cls: {value: p}}}`` with
        string keys.

    Returns
    -------
    list of str
        One class label (a key of ``probabilities['prior_probability']``) per
        row, in row order.

    Raises
    ------
    KeyError
        If a column, or a feature value (including NaN), has no entry in
        ``probabilities``.
    """
    priors = probabilities['prior_probability']
    predictions = []

    for _, row in dataset.iterrows():
        # Start from the class priors, then multiply in one likelihood per feature.
        posterior_probs = dict(priors)

        for feature_col in dataset.columns:
            value_key = _feature_key(row[feature_col])
            for target_class, likelihoods in probabilities[feature_col].items():
                posterior_probs[target_class] *= likelihoods[value_key]

        predictions.append(max(posterior_probs, key=posterior_probs.get))

    return predictions

def calc_accuracy(actual_target, predicted_target):
    """Return the fraction of predictions that match the actual labels.

    Labels are compared by their ``str()`` form on both sides, so integer
    ground truth matches string predictions (and integer predictions too).

    Parameters
    ----------
    actual_target : iterable
        Ground-truth labels.
    predicted_target : iterable
        Predicted labels, in the same order as ``actual_target``.

    Returns
    -------
    float
        Number of matching pairs divided by the number of actual labels.

    Raises
    ------
    ValueError
        If the two inputs differ in length. ``zip`` would otherwise truncate
        silently and the result would be wrong.
    ZeroDivisionError
        If ``actual_target`` is empty.
    """
    actual = list(actual_target)
    predicted = list(predicted_target)
    if len(actual) != len(predicted):
        raise ValueError(
            f"actual_target has {len(actual)} items but predicted_target has {len(predicted)}"
        )

    correct = sum(str(a) == str(p) for a, p in zip(actual, predicted))
    return correct / len(actual)

In [11]:
# Read the validation file.
datasetVal = pd.read_csv("./data/data_validation.csv")

# Non-binary columns of the TRAINING frame, minus the dropped ones and the target.
# NOTE(review): cols_to_bin_val is not referenced again in the visible cells;
# the conversion below is driven by binning_info alone.
cols_to_bin_val = [
    col
    for col in dataset.columns
    if col not in cols_to_drop and not dataset[col].isin([0, 1]).all()
]
cols_to_bin_val.remove('price_range')

# Apply the training bin edges, then remove the target and the dropped features
# so the column set matches the one the model was fitted on.
binned_data_val = (
    convert_to_bin(datasetVal, binning_info)
    .drop(columns=['price_range'] + cols_to_drop)
)
binned_data_val

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,0,0,2,0,0,2,3,0,0,0,0,2,1,1,3,0,0,1
1,0,1,0,1,2,3,2,2,1,2,3,2,1,0,1,1,1,0
2,0,1,3,1,0,0,3,2,0,0,0,3,0,0,1,0,0,0
3,0,0,0,0,2,2,2,0,3,1,3,0,3,3,1,1,1,1
4,0,0,0,1,1,0,1,1,3,0,0,3,1,0,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,2,1,3,0,3,1,1,0,0,0,1,1,1,0,3,0,1,1
596,3,0,2,0,2,3,1,3,3,0,0,3,3,1,3,1,1,0
597,0,1,3,1,1,0,3,1,0,1,3,0,0,0,0,1,1,1
598,3,1,3,1,2,0,3,1,0,1,3,1,1,2,3,1,1,1


In [12]:
# Classify every validation row with the fitted probability tables.
predicted_values = predict(binned_data_val, probability)

In [13]:
# Share of validation rows whose predicted label equals the true price_range.
accuracyScore = calc_accuracy(datasetVal['price_range'], predicted_values)
accuracyScore

0.76

### Save Model using Pickle

In [14]:
class NaiveBayesPhoneModel:
  """Self-contained Naive Bayes classifier bundling preprocessing and prediction.

  The object stores the fitted probability tables, the bin edges learned at
  training time and the list of feature columns that were removed before
  fitting, so raw rows can be classified directly.

  Parameters
  ----------
  probability : dict
      ``{'prior_probability': {cls: p}, feature: {cls: {value: p}}}`` with
      string keys.
  binning_info : dict
      ``{column: {'bin_edges': [...]}}`` recorded when the training data was binned.
  cols_to_drop : list of str, optional
      Feature columns excluded from the model. Defaults to ``['fc', 'four_g']``.
  """

  def __init__(self, probability, binning_info, cols_to_drop=None):
    self.binning_info = binning_info
    self.probability = probability
    self.cols_to_drop = ['fc', 'four_g'] if cols_to_drop is None else list(cols_to_drop)

  @staticmethod
  def _feature_key(value):
    """Return the lookup key for a feature value, writing integral floats as ints."""
    # iterrows() upcasts a row with mixed int/float columns to float, so a bin
    # code 1 arrives as 1.0 and str() alone would produce the unknown key '1.0'.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)

  def __convert_to_bin(self, data, binning_info):
    """Return a copy of ``data`` with the columns of ``binning_info`` binned."""
    copied_data = data.copy()
    for column, info in binning_info.items():
        # Open the outermost bins so values outside the training range land in
        # the first / last bin instead of becoming NaN.
        bin_edges = list(info['bin_edges'])
        bin_edges[0], bin_edges[-1] = float('-inf'), float('inf')
        copied_data[column] = pd.cut(copied_data[column], bins=bin_edges, labels=False)

    return copied_data

  def predict(self, predict_data):
    """Predict a class label for every row of a raw (unbinned) frame.

    ``predict_data`` may or may not contain the ``price_range`` target and
    the columns listed in ``cols_to_drop``; whichever are present are
    removed before classification. The input frame is not modified.

    Returns a list with one string class label per row.

    Raises KeyError if a model feature that needs binning is missing, or if a
    column or value has no entry in the probability tables.
    """
    # Remove the target and the excluded features first. errors='ignore' lets
    # unlabeled data (no 'price_range') through.
    excluded = ['price_range'] + self.cols_to_drop
    features = predict_data.drop(columns=excluded, errors='ignore')

    # Bin only the columns that are still part of the model.
    active_bins = {
        column: info
        for column, info in self.binning_info.items()
        if column not in excluded
    }
    binned_predict_data = self.__convert_to_bin(features, active_bins)

    priors = self.probability['prior_probability']
    predictions = []

    for _, row in binned_predict_data.iterrows():
        # Start from the class priors, then multiply in one likelihood per feature.
        posterior_probs = dict(priors)

        for feature_col in binned_predict_data.columns:
            value_key = self._feature_key(row[feature_col])
            for target_class, likelihoods in self.probability[feature_col].items():
                posterior_probs[target_class] *= likelihoods[value_key]

        predictions.append(max(posterior_probs, key=posterior_probs.get))

    return predictions
  

In [15]:
# Bundle the fitted tables and the bin edges into one model object and write
# it to disk with pickle.
# NOTE(review): open() does not create directories, so ./model must already exist.
NB_obj_phone_model = NaiveBayesPhoneModel(probability, binning_info)
with open("./model/NB_phone_model.pkl", 'wb') as f:
    pickle.dump(NB_obj_phone_model, f)

### Test the model

In [16]:
# Read the raw bytes of the pickled model back from disk.
# Unpickling can execute arbitrary code, so only load files you trust; this one
# was written by the cell above.
with open("./model/NB_phone_model.pkl", "rb") as file:
    serialized_obj = file.read()

# Rebuild the model object from the bytes.
new_model = pickle.loads(serialized_obj)

# Round-trip check: the reloaded model is scored on the same validation frame
# as before (datasetVal still contains 'price_range'; the model drops it itself).
pred = new_model.predict(datasetVal)
calc_accuracy(datasetVal['price_range'], pred)

0.76