In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def compute_accuracy(predictions, actual):
    """Return the percentage of predictions that equal the actual labels.

    Parameters
    ----------
    predictions : array-like
        Predicted labels (NumPy array, pandas Series, list, tuple, ...).
    actual : array-like
        Ground-truth labels; must contain as many elements as ``predictions``.

    Returns
    -------
    float
        Accuracy expressed in percent (0 to 100).

    Raises
    ------
    ValueError
        If the inputs are empty or do not contain the same number of elements.
    """
    # np.asarray accepts lists and tuples as well as Series/arrays, and drops
    # any pandas index so that the comparison below is purely positional.
    p = np.asarray(predictions).ravel()
    a = np.asarray(actual).ravel()

    if p.size != a.size:
        raise ValueError(
            f"predictions and actual differ in length: {p.size} != {a.size}"
        )
    if a.size == 0:
        raise ValueError("cannot compute accuracy of empty inputs")

    # Vectorised element-wise comparison instead of a Python-level loop.
    correct = int(np.count_nonzero(p == a))
    return correct / a.size * 100

In [3]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# Load the training set, discarding every row that contains a missing value.
sentiments = pd.read_csv("train.csv").dropna()

# "IsPositive" is used as the target as-is; no label encoding is applied.
X, y = sentiments["Text"], sentiments["IsPositive"]

# Hold out 25% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [4]:
from sklearn.naive_bayes import MultinomialNB

In [5]:
# Stop-word sources:
#   tagalog      - https://github.com/explosion/spaCy/discussions/6122
#   english      - https://gist.githubusercontent.com/ZohebAbai/513218c3468130eacff6481f424e4e64/raw/b70776f341a148293ff277afa0d0302c8c38f7e2/gist_stopwords.txt
#   punctuations - compiled by hand
# The English list is read from a local copy stored as comma-separated text.
with open("gist_stopwords.txt", "r", encoding="utf-8") as gist_file:
    content = gist_file.read()

# Strip surrounding whitespace (e.g. a trailing newline after the last entry)
# and drop empty fields, so that every stop word can match a token exactly.
english = [word.strip() for word in content.split(",") if word.strip()]

tagalog = [
    'ba',
    'eh',
    'kasi',
    'lang',
    'mo',
    'naman',
    'opo',
    'po',
    'si',
    'talaga',
    'yung',
]
punctuations = ['!', '.', ',', '\"', '\'', ';', ':', '?', '*', '%', '@', '&', '$', '^', '(', ')', '_', '-', '+', '=']

# Build the combined list in a single expression instead of extending in place.
stopwords = english + tagalog + punctuations

In [6]:
# Bag-of-words vectorizer: a token is any run of non-whitespace characters,
# and the entries of `stopwords` are passed as the stop-word list.
# The vocabulary is learned from the training split only.
count_vect = CountVectorizer(
    token_pattern=r'[^\s]+',
    stop_words=stopwords,
)
count_vect.fit(X_train)

In [7]:
# Encode both splits with the vectorizer that was fitted on the training split.
X_train_count_sparse_matrix, X_validation_count_sparse_matrix = (
    count_vect.transform(split) for split in (X_train, X_validation)
)

In [8]:
from sklearn.model_selection import ParameterGrid

In [9]:
# Naive Bayes classifier; the search loop re-configures it for every candidate.
# (A stray `sentiment_nb.get_params()` call whose result was discarded has been
# removed; it had no effect.)
sentiment_nb = MultinomialNB()

# Candidate settings to evaluate: `alpha` is the additive smoothing strength,
# `fit_prior` toggles whether class priors are learned from the training data.
hyperparameters = [{
    'alpha': [0.05, 0.1, 0.5, 1, 3, 5, 10, 15],
    'fit_prior': [True, False],
}]

In [10]:
# Exhaustive search over `hyperparameters`: fit on the training split, score
# on both splits, and remember the setting with the best validation accuracy.
#
# `best_score` starts below any achievable accuracy and `best_grid` is
# pre-initialised, so the final prints never hit an undefined name, even when
# every candidate scores 0% or the grid is empty.
best_score = float("-inf")
best_grid = None
for g in ParameterGrid(hyperparameters):
    print(g)

    # Re-configure the shared estimator in place, then refit it.
    sentiment_nb.set_params(**g)
    sentiment_nb.fit(X_train_count_sparse_matrix, y_train)

    predictions = sentiment_nb.predict(X_train_count_sparse_matrix)
    train_acc = compute_accuracy(predictions, y_train)

    predictions = sentiment_nb.predict(X_validation_count_sparse_matrix)
    val_acc = compute_accuracy(predictions, y_validation)

    print(f"Train acc: {train_acc}% \t Val acc: {val_acc}%", end="\n\n")

    # Strict comparison: on ties the earliest candidate is kept.
    if val_acc > best_score:
        best_score = val_acc
        best_grid = g

print("Best accuracy: ", best_score, "%")
print("Best grid: ", best_grid)

{'alpha': 0.05, 'fit_prior': True}
Train acc: 99.32228156807416% 	 Val acc: 88.10369586576978%

{'alpha': 0.05, 'fit_prior': False}
Train acc: 99.32228156807416% 	 Val acc: 88.1943919582798%

{'alpha': 0.1, 'fit_prior': True}
Train acc: 99.06782223118009% 	 Val acc: 88.41357418184566%

{'alpha': 0.1, 'fit_prior': False}
Train acc: 99.06782223118009% 	 Val acc: 88.49671226664651%

{'alpha': 0.5, 'fit_prior': True}
Train acc: 97.54484530887837% 	 Val acc: 88.85571763283198%

{'alpha': 0.5, 'fit_prior': False}
Train acc: 97.54484530887837% 	 Val acc: 88.94263472148741%

{'alpha': 1, 'fit_prior': True}
Train acc: 96.1226443615842% 	 Val acc: 88.8746126521049%

{'alpha': 1, 'fit_prior': False}
Train acc: 96.1226443615842% 	 Val acc: 88.98042476003324%

{'alpha': 3, 'fit_prior': True}
Train acc: 93.28328126574624% 	 Val acc: 88.51560728591943%

{'alpha': 3, 'fit_prior': False}
Train acc: 93.28076186637105% 	 Val acc: 88.60252437457487%

{'alpha': 5, 'fit_prior': True}
Train acc: 91.927844401

In [11]:
# Final model: refit on all labelled rows (train + validation) using the
# hyper-parameters selected by the validation search (`best_grid`), instead of
# hard-coded values that can drift out of sync with the search results.
# NOTE(review): `count_vect` was fitted on X_train only, so words that occur
# solely in the validation rows are not part of the vocabulary used here.
sentiment_nb = MultinomialNB(class_prior=None, **best_grid)
X_train_val_count_sparse_matrix = count_vect.transform(X)
sentiment_nb.fit(X_train_val_count_sparse_matrix, y)

In [12]:
# Load the test set; note that this rebinds `sentiments`, replacing the
# training frame. Rows containing any missing value are discarded.
# NOTE(review): discarded rows receive no prediction - confirm the submission
# does not require a row for every DocumentId.
sentiments = pd.read_csv("test.csv").dropna()
X_test = sentiments["Text"]

# Vectorize with the already-fitted vocabulary and classify each remaining row.
X_test_val_count_sparse_matrix = count_vect.transform(X_test)
predictions = sentiment_nb.predict(X_test_val_count_sparse_matrix)

[ True  True False ...  True False False]


In [13]:
# print(sentiments['DocumentId'].tolist())

[1450521, 889575, 1168201, 81808, 188577, 1454424, 2007623, 1017748, 677156, 497475, 1402401, 473428, 2173302, 302989, 1072379, 232322, 935581, 1461194, 434002, 418987, 1733540, 2381377, 3121, 656395, 1973120, 2301012, 2264073, 2184878, 1517809, 720112, 1127436, 799134, 1063138, 2368037, 304197, 94249, 401787, 377053, 2044148, 440577, 1687845, 1908227, 253239, 1622830, 949555, 2219658, 2107788, 567522, 758259, 2322856, 2379505, 612916, 1583847, 2273183, 1649094, 2323193, 2099359, 2358415, 922135, 525222, 1537300, 1134961, 2186562, 1319729, 1447299, 449815, 2098860, 1776689, 2260691, 634823, 1703306, 1681282, 610233, 1306256, 565692, 626879, 1633371, 1939088, 1802835, 1346194, 1325774, 950450, 385149, 963728, 439821, 771534, 1983052, 667530, 544471, 919487, 1017094, 1902587, 2335456, 2012505, 1836082, 1166925, 605353, 1617490, 1655205, 195658, 887164, 1640232, 19799, 557844, 2059765, 844472, 1044028, 1295158, 478581, 968333, 2164088, 109155, 372103, 580545, 770853, 1637273, 59321, 13063

In [14]:
import csv
# Write one (DocumentId, IsPositive) row per prediction to submission.csv.
header = ['DocumentId', 'IsPositive']
ids = sentiments['DocumentId'].tolist()
if len(ids) != len(predictions):
    raise ValueError(
        f"{len(ids)} document ids but {len(predictions)} predictions"
    )

# Both fields are converted with str() before being written.
pred = [
    {'DocumentId': str(doc_id), 'IsPositive': str(label)}
    for doc_id, label in zip(ids, predictions)
]

# Open the file only once the rows are ready, and let the context manager
# close it even if writing fails. newline='' leaves line endings to the csv
# module, as the csv documentation recommends.
with open('submission.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=header)
    writer.writeheader()
    writer.writerows(pred)

[{'DocumentId': '1450521', 'IsPositive': 'True'}, {'DocumentId': '889575', 'IsPositive': 'True'}, {'DocumentId': '1168201', 'IsPositive': 'False'}, {'DocumentId': '81808', 'IsPositive': 'False'}, {'DocumentId': '188577', 'IsPositive': 'True'}, {'DocumentId': '1454424', 'IsPositive': 'True'}, {'DocumentId': '2007623', 'IsPositive': 'True'}, {'DocumentId': '1017748', 'IsPositive': 'False'}, {'DocumentId': '677156', 'IsPositive': 'False'}, {'DocumentId': '497475', 'IsPositive': 'False'}, {'DocumentId': '1402401', 'IsPositive': 'False'}, {'DocumentId': '473428', 'IsPositive': 'False'}, {'DocumentId': '2173302', 'IsPositive': 'True'}, {'DocumentId': '302989', 'IsPositive': 'False'}, {'DocumentId': '1072379', 'IsPositive': 'True'}, {'DocumentId': '232322', 'IsPositive': 'False'}, {'DocumentId': '935581', 'IsPositive': 'False'}, {'DocumentId': '1461194', 'IsPositive': 'True'}, {'DocumentId': '434002', 'IsPositive': 'True'}, {'DocumentId': '418987', 'IsPositive': 'False'}, {'DocumentId': '1733