In [1]:
import nltk
import pandas as pd

In [3]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

porter = PorterStemmer()
wordnet = WordNetLemmatizer()

dataset_path = './dataset.csv'

dataset = pd.read_csv(dataset_path, sep=";")

dataset.head()


Unnamed: 0,selected_text,sentiment
0,Sooo SAD,negative
1,bullying me,negative
2,leave me alone,negative
3,"Sons of ****,",negative
4,fun,positive


In [4]:
from random import shuffle
from nltk import FreqDist
import pickle

stop_words = stopwords.words('english')

def preprocess(text):
    tokenized = word_tokenize(text)

    filtered = [word.lower() for word in tokenized if word.isalpha()]

    removed_stopwords = [word for word in filtered if word not in stop_words]

    result = [porter.stem(word) for word in removed_stopwords]
    result = [wordnet.lemmatize(word) for word in result]

    return result

all_words = []

for sentence in dataset["selected_text"]:
    for word in preprocess(sentence):
        all_words.append(word)

fd = FreqDist(all_words)

featured_word = {word for word in fd.most_common(100)}

feature_sets = []

for text, category in zip(dataset["selected_text"], dataset["sentiment"]):
    feature = {}

    processed_words = preprocess(text)

    for word in processed_words:
        feature[word] = (word in featured_word)
    
    feature_sets.append((feature, category))


shuffle(feature_sets)

split_index = int(len(feature_sets) * 0.8)
train_set, test_set = feature_sets[:split_index], feature_sets[split_index:]

classifier = nltk.NaiveBayesClassifier.train(train_set)

accuracy = nltk.classify.util.accuracy(classifier, test_set)
print(f'Accuracy: {accuracy}')

classifier.show_most_informative_features(10)

file = open('model.pickle', 'wb')
pickle.dump(classifier, file)
file.close()


Accuracy: 0.7948717948717948
Most Informative Features
                    hope = False          positi : negati =      9.2 : 1.0
                     fun = False          positi : negati =      6.4 : 1.0
                    good = False          positi : negati =      6.3 : 1.0
                  awesom = False          positi : negati =      5.8 : 1.0
                    best = False          positi : negati =      4.4 : 1.0
                    wish = False          positi : negati =      4.4 : 1.0
                    nice = False          positi : negati =      3.7 : 1.0
                    well = False          negati : positi =      3.6 : 1.0
                     day = False          positi : negati =      3.3 : 1.0
                     bad = False          negati : positi =      2.9 : 1.0
