In [1]:
import re
import pandas as pd
import nltk
from collections import defaultdict
from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs

In [2]:
# using data frames to read bag of joy words
df = pd.read_csv("joy_words.txt", encoding="utf-8", index_col=None , names=["joy_words"])
joy_words = df["joy_words"].str.lower().to_list()

wr_directory = 'joy_tweets_trainset.txt'

In [3]:
# reading tweets as a df
# error_bad_lines to skip bad formatted lines
# namses to assign column names after separation
original_sentences_df = pd.read_csv(
"tweets_trainset_short.txt", 
    sep="\t", 
    error_bad_lines=False, 
    encoding="utf-8",
    index_col=None,
    names=["id", "timestamp", "sentence", "time"])

In [4]:
# regex patterns
indx = r'\@.+?\s'
html = r'(www|http:|https:)[^\s]+[\w]'
punct = r'[!"\\#\\$%\\&\\(\\)\\*\\+,\\-\\.\/:;<=>\\?@\\[\\\\\\\]\\^_`\\{\\|\\}\\~]'

In [5]:
# joining regex patterns
replace_patterns = "|".join([indx, html, punct])
minor_pattern = "\s+"

# replacing patterns, cleaning whitespaces, saving column as a string with astype(str)
original_sentences_df["sentence"] = original_sentences_df["sentence"].str.replace(replace_patterns, "", regex=True)
original_sentences_df["sentence"] = original_sentences_df["sentence"].str.replace(minor_pattern, " ", regex=True)
senteces_ready = original_sentences_df["sentence"].astype(str).str.lower().str.strip().to_list()
#senteces_ready

In [6]:
d = [] # combined list of joy and no_joy sentences
joy = [] 
no_joy = []

joy_counter = 0
no_joy_counter = 0

for sents in senteces_ready:
    sentences = sents.split(" ")
    x = set(sentences).intersection(joy_words)
    if x:
        d.append((" ".join(sentences), 'joy_sentence'))
        joy.append((" ".join(sentences), 'joy_sentence'))
        joy_counter += 1
    else:
        d.append((" ".join(sentences), 'no_joy_sentence'))
        no_joy.append((" ".join(sentences), 'no_joy_sentence'))
        no_joy_counter += 1
        
print(f"there're {joy_counter} examples of joy sentiment")
print(f"there're {no_joy_counter} examples of no joy sentiment")

there're 26968 examples of joy sentiment
there're 72178 examples of no joy sentiment


In [7]:
#with open(wr_directory, 'w', encoding="utf-8") as out:
    #p = "".join(str(d))
    #print(p)
    #out.write("".join(str(d)))

In [8]:
# print first 10 sentences to test
for i, line in enumerate(d):
    if i <= 10:
        #print(i, line)
        pass

In [9]:
# new list for splitted tockens
sents = [] 

# filtering sentences by word size and sentiments
for (words, sentiment) in joy + no_joy:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    #print(words_filtered)
    sents.append((words_filtered, sentiment))

In [10]:
# functions creating a dict with words ordered by their appearance frequency
def get_words_in_sents(sents):
    all_words = []
    for (words, sentiment) in sents:
      all_words.extend(words)
    return all_words

def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

In [11]:
word_features = get_word_features(get_words_in_sents(sents))
#word_features

In [12]:
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [13]:
# applying features to the classifier using the apply_features method and the tweets processed above
training_set = nltk.classify.apply_features(extract_features, sents)

In [None]:
# classifier training
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [None]:
# counting frequency of each label in the training set 
def train(labeled_featuresets, estimator=ELEProbDist):
    label_probdist = estimator(label_freqdist)
    feature_probdist = {}
    return NaiveBayesClassifier(label_probdist, feature_probdist)

In [None]:
# test
sent = 'I wish to pass Java 2 course in the next semester'
classifier.classify(extract_features(sent.split()))