In [None]:
import numpy as np
import pandas as pd

In [None]:
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` keeps the old behaviour of `error_bad_lines=False`:
# malformed rows are dropped and a warning is emitted for each of them.
df = pd.read_csv('afghanForum.csv', on_bad_lines='warn')
df.head()

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Work on an explicit copy of the text column: later cells add new columns to
# `messages`, and assigning into a slice of `df` can raise
# SettingWithCopyWarning (and leaves it unclear which frame is modified).
messages = df[["Message"]].copy()

In [None]:
import textblob
import re
import nltk
import string
from collections import Counter

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def word_freq(messages, TEXTS, top_n=20):
    """Plot and return the most frequent tokens of a text column.

    Parameters
    ----------
    messages : pandas.DataFrame
        Frame that holds the text to analyse.
    TEXTS : str
        Name of the column in ``messages`` containing the text.
    top_n : int, optional
        Number of most common tokens to plot and return (default 20).

    Returns
    -------
    list of (str, int)
        ``(token, count)`` pairs ordered from most to least frequent.
    """
    # Missing values (NaN/None) and other non-string cells have no ``.lower()``;
    # skip them instead of failing with AttributeError.
    texts = [
        text.lower()
        for text in messages[TEXTS].tolist()
        if isinstance(text, str)
    ]

    # A token is noise when *every* character is punctuation. The previous
    # ``token not in string.punctuation`` was a substring test, so
    # multi-character punctuation tokens (e.g. '...' or '--') were counted
    # as words.
    punctuation_chars = set(string.punctuation)
    token_counts = Counter(
        token
        for text in texts
        for token in nltk.word_tokenize(text)
        if not all(char in punctuation_chars for char in token)
    )

    freq_top = token_counts.most_common(top_n)
    words = [word for word, _ in freq_top]
    counts = [count for _, count in freq_top]

    plt.bar(words, counts)
    plt.title(f"MOST COMMON {top_n} WORDS")
    plt.ylabel("Frequency")
    plt.xlabel("Words")
    plt.xticks(rotation=90)
    plt.show()

    return freq_top

In [None]:
# Plot the most common tokens of the 'Message' column and keep the returned
# (word, count) pairs for reuse in later cells.
freq_top = word_freq(messages, 'Message')

In [None]:
from wordcloud import WordCloud

In [None]:
def print_wordcloud(dict_top):
    """Draw a word cloud sized by the given word frequencies.

    ``dict_top`` is passed through ``dict()``, so either a mapping or an
    iterable of ``(word, count)`` pairs is accepted. The image is displayed
    without axes; nothing is returned.
    """
    frequencies = dict(dict_top)
    cloud_settings = {
        'width': 500,
        'height': 500,
        'background_color': 'white',
        'min_font_size': 5,
    }
    cloud = WordCloud(**cloud_settings).generate_from_frequencies(frequencies)

    plt.figure()
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()

In [None]:
# Render the word cloud from the (word, count) pairs returned by word_freq.
print_wordcloud(freq_top)

In [None]:
# English stop words used by preprocess_text to drop filler words.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be installed
# locally before this cell runs — confirm for fresh environments.
from nltk.corpus import stopwords
stop_words_list = stopwords.words('english')

In [None]:
def preprocess_text(text, stop_words=None):
    """Normalise a raw message into a space-separated string of tokens.

    Steps: lower-case, replace newlines and digits with spaces, delete
    punctuation, drop stop words, squeeze any run of a repeated character
    down to two, and discard tokens of length one.

    Parameters
    ----------
    text : str
        Raw message. Any non-string value (e.g. NaN or None coming from a
        missing cell) yields an empty string instead of raising.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the notebook-level ``stop_words_list``.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing survives.
    """
    # Series.apply hands NaN/None to this function for missing messages.
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = stop_words_list

    strip_punctuation = str.maketrans("", "", string.punctuation)

    # Punctuation is deleted from the text before stop words are filtered, so
    # "don't" arrives here as "dont". Match the stripped spellings as well,
    # otherwise contractions in the stop list are never removed. A set also
    # makes each membership test O(1).
    blocked = set(stop_words)
    blocked.update(word.translate(strip_punctuation) for word in stop_words)

    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\d', ' ', text)
    text = text.translate(strip_punctuation)

    words = [word for word in text.split() if word not in blocked]
    # "loooove" -> "loove": runs of two or more identical characters become two.
    words = [re.sub(r'(.)\1{1,}', r'\1\1', word) for word in words]
    # split() already removed surrounding whitespace; only the length matters.
    words = [word for word in words if len(word) > 1]
    return ' '.join(words)

In [None]:
# Add a 'cleaned' column holding the preprocess_text output for each message.
messages['cleaned'] = messages['Message'].apply(preprocess_text)

In [None]:
# Preview the first rows to compare the raw and cleaned text.
messages.head()

In [None]:
def get_message_sentiment(message):
    """Classify a message by the sign of its TextBlob polarity.

    Returns 'POSITIVE' for a polarity above zero, 'NEUTRAL' for exactly
    zero, and 'NEGATIVE' otherwise.
    """
    polarity = textblob.TextBlob(message).sentiment.polarity
    if polarity > 0:
        return 'POSITIVE'
    if polarity == 0:
        return 'NEUTRAL'
    return 'NEGATIVE'

In [None]:
# Label every cleaned message as POSITIVE / NEUTRAL / NEGATIVE.
messages["sentiment"] = messages["cleaned"].apply(get_message_sentiment)

In [None]:
# Preview the first rows, now including the sentiment label.
messages.head()

In [None]:
# Share of each sentiment label, as a pie chart with percentage annotations.
messages["sentiment"].value_counts().plot.pie(autopct='%.2f%%')

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
def label_encoder(column):
    """Encode the values of ``column`` as integers with a LabelEncoder.

    Prints the column name together with the classes the encoder learned,
    then returns the transformed values.
    """
    encoder = LabelEncoder()
    encoder.fit(column)
    print(column.name, encoder.classes_)
    encoded = encoder.transform(column)
    return encoded

In [None]:
# Replace the sentiment column with the integer codes from label_encoder.
# NOTE(review): the column is overwritten in place, so re-running this cell
# fits the encoder on the already-encoded integers and prints those instead
# of the original label names.
messages["sentiment"] = label_encoder(messages["sentiment"])

In [None]:
# Preview the first rows after the sentiment column was encoded.
messages.head()

In [None]:
# Model input is the cleaned text; the target is the sentiment column.
X = messages["cleaned"]
y = messages["sentiment"]

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Report the size of each part of the train/test split.
split_parts = (
    ("X train: ", X_train),
    ("X test: ", X_test),
    ("Y train: ", y_train),
    ("Y test: ", y_test),
)
for label, part in split_parts:
    print(label, part.shape)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
# Fit the TF-IDF vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# Same procedure with raw token counts: fit on the training text only, then
# transform the test text with the fitted vectorizer.
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Random forest on the TF-IDF features, scored on the held-out split.
# Without a seed the forest differs on every run, so the reported accuracy is
# not reproducible; 42 matches the seed used for the train/test split.
rf_tfidf = RandomForestClassifier(random_state=42)
rf_tfidf.fit(X_train_tfidf, y_train)
pred_rf_tfidf = rf_tfidf.predict(X_test_tfidf)
# scikit-learn metrics take (y_true, y_pred), in that order.
ac_rf_tfidf = accuracy_score(y_test, pred_rf_tfidf)
print("RF-TFIDF Accuracy Score: ", ac_rf_tfidf)

In [None]:
# confusion_matrix expects (y_true, y_pred); with the arguments swapped the
# matrix is transposed. Rows are now the true classes, columns the predictions.
cm = confusion_matrix(y_test, pred_rf_tfidf)
# Cells hold integer counts, so format them as integers rather than floats.
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Pastel1")
ax.set(title="RF-TFIDF Confusion Matrix", xlabel="Predicted label", ylabel="True label")
plt.show()

In [None]:
# Random forest on the count-vectorizer features, scored on the held-out split.
# Without a seed the forest differs on every run, so the reported accuracy is
# not reproducible; 42 matches the seed used for the train/test split.
rf_cv = RandomForestClassifier(random_state=42)
rf_cv.fit(X_train_cv, y_train)
pred_rf_cv = rf_cv.predict(X_test_cv)
# scikit-learn metrics take (y_true, y_pred), in that order.
ac_rf_cv = accuracy_score(y_test, pred_rf_cv)
print("RF-CV Accuracy Score: ", ac_rf_cv)

In [None]:
# confusion_matrix expects (y_true, y_pred); with the arguments swapped the
# matrix is transposed. Rows are now the true classes, columns the predictions.
cm = confusion_matrix(y_test, pred_rf_cv)
# Cells hold integer counts, so format them as integers rather than floats.
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Pastel1")
ax.set(title="RF-CV Confusion Matrix", xlabel="Predicted label", ylabel="True label")
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree on the TF-IDF features, scored on the held-out split.
# The tree breaks ties between equally good splits at random, so seed it to
# keep the reported accuracy reproducible; 42 matches the train/test split seed.
dt_tfidf = DecisionTreeClassifier(random_state=42)
dt_tfidf.fit(X_train_tfidf, y_train)
pred_dt_tfidf = dt_tfidf.predict(X_test_tfidf)
# scikit-learn metrics take (y_true, y_pred), in that order.
ac_dt_tfidf = accuracy_score(y_test, pred_dt_tfidf)
print("DT-TFIDF Accuracy Score: ", ac_dt_tfidf)

In [None]:
# confusion_matrix expects (y_true, y_pred); with the arguments swapped the
# matrix is transposed. Rows are now the true classes, columns the predictions.
cm = confusion_matrix(y_test, pred_dt_tfidf)
# Cells hold integer counts, so format them as integers rather than floats.
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Pastel1")
ax.set(title="DT-TFIDF Confusion Matrix", xlabel="Predicted label", ylabel="True label")
plt.show()

In [None]:
# Decision tree on the count-vectorizer features, scored on the held-out split.
# The tree breaks ties between equally good splits at random, so seed it to
# keep the reported accuracy reproducible; 42 matches the train/test split seed.
dt_cv = DecisionTreeClassifier(random_state=42)
dt_cv.fit(X_train_cv, y_train)
pred_dt_cv = dt_cv.predict(X_test_cv)
# scikit-learn metrics take (y_true, y_pred), in that order.
ac_dt_cv = accuracy_score(y_test, pred_dt_cv)
print("DT-CV Accuracy Score: ", ac_dt_cv)

In [None]:
# confusion_matrix expects (y_true, y_pred); with the arguments swapped the
# matrix is transposed. Rows are now the true classes, columns the predictions.
cm = confusion_matrix(y_test, pred_dt_cv)
# Cells hold integer counts, so format them as integers rather than floats.
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Pastel1")
ax.set(title="DT-CV Confusion Matrix", xlabel="Predicted label", ylabel="True label")
plt.show()

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial naive Bayes on the TF-IDF features, scored on the held-out split.
mnb_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)
pred_mnb_tfidf = mnb_tfidf.predict(X_test_tfidf)
# Accuracy is symmetric in its arguments; (y_true, y_pred) is the usual order.
ac_mnb_tfidf = accuracy_score(y_test, pred_mnb_tfidf)
print("MNB-TFIDF Accuracy Score: ", ac_mnb_tfidf)

In [None]:
# Multinomial naive Bayes on the count-vectorizer features, scored on the
# held-out split.
mnb_cv = MultinomialNB().fit(X_train_cv, y_train)
pred_mnb_cv = mnb_cv.predict(X_test_cv)
# Accuracy is symmetric in its arguments; (y_true, y_pred) is the usual order.
ac_mnb_cv = accuracy_score(y_test, pred_mnb_cv)
print("MNB-CV Accuracy Score: ", ac_mnb_cv)

In [None]:
# Accuracy of every model / vectorizer combination, keyed by a short label,
# gathered in one place for the comparison plot.
models_and_scores = {
    "RF-TFIDF": ac_rf_tfidf,
    "RF-CV": ac_rf_cv,
    "DT-TFIDF": ac_dt_tfidf,
    "DT-CV": ac_dt_cv,
    "MNB-TFIDF": ac_mnb_tfidf,
    "MNB-CV": ac_mnb_cv
}

In [None]:
# Compare the accuracy of all models side by side. The figure gets a title and
# axis labels so it can be read on its own, like the word-frequency plot.
model_names = list(models_and_scores.keys())
accuracies = list(models_and_scores.values())
plt.bar(model_names, accuracies)
plt.title("Accuracy by model and vectorizer")
plt.xlabel("Model")
plt.ylabel("Accuracy")
plt.show()