# Bag of Words + Logistic Regression

## Importing Libraries

In [36]:
import pandas as pd
import re
import nltk

## Importing the dataset

In [37]:
dataset_train = pd.read_csv(r'data/reddit-train.csv')
dataset_test = pd.read_csv('data/reddit-test.csv')
frames = [dataset_train, dataset_test]
dataset = pd.concat(frames)

# Select text and label columns
text = dataset["text"]
label = dataset["label"]
data_dict = {'text': text, 'label': label}
dataset = pd.DataFrame(data_dict)
print(dataset)

                                                  text  label
0    He said he had not felt that way before, sugge...      1
1    Hey there r/assistance, Not sure if this is th...      0
2    My mom then hit me with the newspaper and it s...      1
3    until i met my new boyfriend, he is amazing, h...      1
4    October is Domestic Violence Awareness Month a...      1
..                                                 ...    ...
710  i have horrible vivid nightmares every night. ...      1
711  Also I can't think about both of them without ...      1
712  Furthermore, I told him before we got really s...      1
713  Here's the link to my amazon wish list where t...      0
714  How can I keep us protected? They have already...      1

[3553 rows x 2 columns]


## Set up dictionary functions

In [38]:
nltk.download('punkt')
from nltk.tokenize import word_tokenize as wt

nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

from autocorrect import Speller
spell = Speller()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\soham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\soham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Extract Features from Text

In [39]:
data = []
for i in range(dataset.shape[0]):
    sms = dataset.iloc[i, 0]
    # remove non alphabatic characters
    sms = re.sub('[^A-Za-z]', ' ', sms)
    # make words lowercase, because Go and go will be considered as two words
    sms = sms.lower()
    # tokenising
    tokenized_sms = wt(sms)
    # remove stop words and stemming
    sms_processed = []
    for word in tokenized_sms:
        if word not in set(stopwords.words('english')):
            # sms_processed.append(stemmer.stem(word))
            sms_processed.append(spell(stemmer.stem(word)))
    sms_text = " ".join(sms_processed)
    data.append(sms_text)

## Creating the Feature Matrix

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
matrix = CountVectorizer(max_features=1000)
X = matrix.fit_transform(data).toarray()
y = dataset.iloc[:, 1]

## Split train and test data

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

## Train Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter = 100)
classifier.fit(X_train, y_train)

## Predict test set results

In [None]:
# predict class
y_pred = classifier.predict(X_test)

## Find Accuracy using Confusion matrix


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)

In [None]:
print(accuracy)

0.7271448663853727
