In [45]:
import numpy as np
import nltk
import pickle
import re
from nltk.corpus import stopwords
import pandas as pd
import string
from sklearn.naive_bayes import BernoulliNB
from nltk import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("dreaddit-train.csv")

In [4]:
df.shape

(2838, 116)

In [8]:
stop_words_list = stopwords.words("english")

In [9]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\d", " ", text)
    text = re.sub(r"<.*?>+", "", text)
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"https?://S+|www.\.\S+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    words = text.split()
    words = [word for word in words if not word in stop_words_list]
    words = [re.sub(r"(.)\1{1,}", r"\1\1", word) for word in words]
    words = [word.strip() for word in words if len(word.strip()) > 1]
    text = " ".join(words)
    return text

In [11]:
df["text"] = df["text"].apply(preprocess_text)

In [13]:
stemmer = SnowballStemmer("english")

In [14]:
def stemming(text):
    stemmed_text = ""
    for word in text.split():
        stem = stemmer.stem(word)
        stemmed_text += stem
        stemmed_text += " "
        
    stemmed_text = stemmed_text.strip()
    return stemmed_text

In [15]:
df["text"] = df["text"].apply(stemming)

In [16]:
df["label"] = df["label"].map({
    0: "No Stress", 1: "Stress"
})

In [18]:
X = df["text"]
X = np.array(X)

y = df["label"]
y = np.array(y)

In [20]:
cv = CountVectorizer()
X_scaled = cv.fit_transform(X)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=4242)

In [24]:
model = BernoulliNB()
model.fit(X_train, y_train)

In [31]:
model.score(X_test, y_test)

0.7312206572769953

In [44]:
text = "I'm so happy now."
test = cv.transform([text]).toarray()
model.predict(test)

array(['No Stress'], dtype='<U9')

In [46]:
pickle.dump(model, open("model.pkl", "wb"))

In [47]:
pickle.dump(cv, open("cv.pkl", "wb"))