## Import relevant libraries

In [6]:
# import library
import re
import contractions
import nltk
import pandas as pd
import numpy as np
import os

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from gensim.models import KeyedVectors

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix, accuracy_score

## Step 1: Preprocess Dataset

In [7]:
main_filepath = str(os.getcwd()) + "\\data"

input_filepaths = ["\\pre_depression.csv", "\\post_depression.csv", "\\post_finance.csv", "\\post_fitness.csv", "\\post_jokes.csv"]
output_filepaths = ["\\preproc_predepression.csv", "\\preproc_postdepression.csv", "\\preproc_postfinance.csv", "\\preproc_postfitness.csv", "\\preproc_postjokes.csv"]
train_fp = "\\train.csv"
validation_fp = "\\validation.csv"
test_fp = "\\test.csv"

In [8]:
def load_data(file_path):
    try:
        # Load the CSV file into a Pandas DataFrame
        df = pd.read_csv(file_path)
        return df
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}. Please provide a valid file path.")
        return None

def save_data(df, output_file):
    try:
        # Save the processed data to a new CSV file
        df.to_csv(output_file, index=False)
        print(f"Processed data saved to {output_file}.")
    except Exception as e:
        print(f"Error occurred while saving the processed data: {e}")

In [9]:
def contract_doc(doc):
    if doc == "nan" or doc == "":
        return ""
    clean_words = []

    for word in doc.split():
        #Remove special characters
        word_wo_sp = re.sub("[^a-zA-Z' ]", "", word)
        
        # using contractions.fix to expand the shortened words
        phrases = contractions.fix(word_wo_sp)

        for word in phrases.split():
            clean_words.append(word)

    #Filter out empty strings after regex replacement
    clean_words = filter(None, clean_words)
    
    #Concatenate back into string
    clean_string = ' '.join(char for char in clean_words)
    return clean_string

def remove_stopwords(doc):
    if doc == "nan" or doc == "":
        return ""
    
    word_tokens = word_tokenize(doc)
    #Try changing stopword corpus to include negation
    stop_words = set(stopwords.words('english'))
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    filtered_sentence = ' '.join(char for char in filtered_sentence)
    if filtered_sentence == '':
        return ""
    return filtered_sentence


In [10]:
overall_df = pd.DataFrame()
training_split = 0.7
validation_split = 0.15
testing_split = 0.15

for i in range(len(input_filepaths)):
    print("Processing file: " + input_filepaths[i])
    input_fp = main_filepath + input_filepaths[i]
    output_fp = main_filepath + output_filepaths[i]
    processed_df = pd.DataFrame()
    df = load_data(input_fp)

    processed_df['subreddit'] = df['subreddit']
    processed_df['post'] = df['post'].apply(lambda x: remove_stopwords(contract_doc(x)))
    processed_df['label'] = df['subreddit'].apply(lambda x: 1 if x == 'depression' else 0)
    
    # Save the processed DataFrame to a new CSV file
    processed_df.to_csv(output_fp, index=False)   
    overall_df = pd.concat([overall_df, processed_df], ignore_index=True)
    print("Done processing file: " + input_filepaths[i]) 

Processing file: \pre_depression.csv
Done processing file: \pre_depression.csv
Processing file: \post_depression.csv
Done processing file: \post_depression.csv
Processing file: \post_finance.csv
Done processing file: \post_finance.csv
Processing file: \post_fitness.csv
Done processing file: \post_fitness.csv
Processing file: \post_jokes.csv
Done processing file: \post_jokes.csv


In [11]:
# Split the data into training, validation, and testing sets
train_df, test_df = train_test_split(overall_df, test_size=1-training_split, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=validation_split/(1.-testing_split), random_state=42)

train_df = train_df.dropna()
val_df = val_df.dropna()
test_df = test_df.dropna()

train_df.to_csv(main_filepath + train_fp, index=False) 
val_df.to_csv(main_filepath + validation_fp, index=False) 
test_df.to_csv(main_filepath + test_fp, index=False) 

## Step 2: TfIdf vectorise

In [12]:
main_filepath = str(os.getcwd()) + "\\data"

#Tfidf based on entire training sets
train_input = "\\train.csv"
validation_input = "\\validation.csv"
test_input = "\\test.csv"

train_count = "\\train_count.csv"
validation_count = "\\validation_count.csv"
test_count = "\\test_count.csv"

train_tfidf = "\\train_tfidf.csv"
validation_tfidf = "\\validation_tfidf.csv"
test_tfidf = "\\test_tfidf.csv"

In [13]:
def load_data(file_path):
    try:
        # Load the CSV file into a Pandas DataFrame
        df = pd.read_csv(file_path)
        return df
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}. Please provide a valid file path.")
        return None

def save_data(df, output_file):
    try:
        # Save the processed data to a new CSV file
        df.to_csv(output_file, index=False)
        print(f"Processed data saved to {output_file}.")
    except Exception as e:
        print(f"Error occurred while saving the processed data: {e}")

In [14]:
print("Gathering Statistical Information: " + train_input)
train_input_fp = main_filepath + train_input
train_output_count = main_filepath + train_count
train_output_tfidf = main_filepath + train_tfidf

train_df = load_data(train_input_fp)
    
# Create the TfidfVectorizer
train_count_vectorizer = CountVectorizer(analyzer= 'word', stop_words='english', max_features=2048)
train_tfidf_vectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english', max_features=2048)

# Fit and transform the documents
train_count_wm = train_count_vectorizer.fit_transform(train_df['post'].values.astype(str))
train_tfidf_wm = train_tfidf_vectorizer.fit_transform(train_df['post'].values.astype(str))

# Get feature names (words) and IDF values
train_count_tokens = train_count_vectorizer.get_feature_names_out()
train_tfidf_tokens = train_tfidf_vectorizer.get_feature_names_out()

traindf_countvect = pd.DataFrame(data = train_count_wm.toarray(), columns = train_count_tokens)
traindf_countvect = pd.concat([train_df, traindf_countvect], axis=1)

traindf_tfidfvect = pd.DataFrame(data = train_tfidf_wm.toarray(), columns = train_tfidf_tokens)
traindf_tfidfvect = pd.concat([train_df, traindf_tfidfvect], axis=1)

traindf_countvect.to_csv(train_output_count, index=False)
traindf_tfidfvect.to_csv(train_output_tfidf, index=False)

print("Done gathering Statistical Information: " + train_input)

Gathering Statistical Information: \train.csv
Done gathering Statistical Information: \train.csv


In [15]:
print("Gathering Statistical Information: " + test_input)
test_input_fp = main_filepath + test_input
test_output_count = main_filepath + test_count
test_output_tfidf = main_filepath + test_tfidf

test_df = load_data(test_input_fp)
    
# Create the TfidfVectorizer
test_count_vectorizer = CountVectorizer(analyzer= 'word', stop_words='english', max_features=2048)
test_tfidf_vectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english', max_features=2048)

# Fit and transform the documents
test_count_wm = test_count_vectorizer.fit_transform(test_df['post'].values.astype(str))
test_tfidf_wm = test_tfidf_vectorizer.fit_transform(test_df['post'].values.astype(str))

# Get feature names (words) and IDF values
test_count_tokens = test_count_vectorizer.get_feature_names_out()
test_tfidf_tokens = test_tfidf_vectorizer.get_feature_names_out()

testdf_countvect = pd.DataFrame(data = test_count_wm.toarray(), columns = test_count_tokens)
testdf_countvect = pd.concat([test_df, testdf_countvect], axis=1)

testdf_tfidfvect = pd.DataFrame(data = test_tfidf_wm.toarray(), columns = test_tfidf_tokens)
testdf_tfidfvect = pd.concat([test_df, testdf_tfidfvect], axis=1)

testdf_countvect.to_csv(test_output_count, index=False)
testdf_tfidfvect.to_csv(test_output_tfidf, index=False)

print("Done gathering Statistical Information: " + test_input)

Gathering Statistical Information: \test.csv
Done gathering Statistical Information: \test.csv


Vector Embedding and Dot Multiplication

In [29]:
main_filepath = str(os.getcwd()) + "\\data"

#Tfidf based on entire training sets
train_input = "\\train_tfidf.csv"
validation_input = "\\validation_tfidf.csv"
test_input = "\\test_tfidf.csv"

# Load the Word2Vec model
model_path = 'C:\\Users\\benny\\Desktop\\Y4S1\\Natural_Language_Processing\\project\\pretrained_word2vec\\en_wiki_word2vec_300\\en_wiki_word2vec_300.txt'

In [17]:
w2v_m = KeyedVectors.load_word2vec_format(model_path, binary=False)

In [18]:
# Get the vector for a specific word
def get_embedding(word, word2vec_model):
    if word in word2vec_model:
        return word2vec_model[word]
    else:
        return np.zeros((300,))

In [25]:
print("Word2vec loaded. Generating word embedding: " + train_input)
train_input_fp = main_filepath + train_input
train_df = pd.read_csv(train_input_fp)

train_embed_vect = []
train_col_headers = train_df.columns.tolist()
train_col_headers = train_col_headers[3:]

for header in train_col_headers:
    header_embedding = get_embedding(header, w2v_m)
    flat_embed = header_embedding.flatten()
    train_embed_vect.append(flat_embed)
print(np.shape(train_embed_vect))
print("Training embedding generated: " + train_input)

Word2vec loaded. Generating word embedding: \train_tfidf.csv
(2048, 300)
Training embedding generated: \train_tfidf.csv


In [26]:
# Getting the tfidf matrix
train_tfidf_mat = train_df.iloc[:, 3:]
print(np.shape(train_tfidf_mat))

# Features (document) are stacked row by row in the matrix
train_vect_rep = np.matmul(train_tfidf_mat, train_embed_vect)
print(np.shape(train_vect_rep))

train_label = train_df['label'].values.flatten()
print("Training feature extracted: " + train_input)

(76921, 2048)
(76921, 300)
Training feature extracted: \train_tfidf.csv


In [30]:
print("Generating word embedding: " + test_input)
test_input_fp = main_filepath + test_input
test_df = pd.read_csv(test_input_fp)

test_embed_vect = []
test_col_headers = test_df.columns.tolist()
test_col_headers = test_col_headers[3:]

for header in test_col_headers:
    header_embedding = get_embedding(header, w2v_m)
    flat_embed = header_embedding.flatten()
    test_embed_vect.append(flat_embed)
print(np.shape(test_embed_vect))
print("Test embedding generated: " + test_input)

Generating word embedding: \test_tfidf.csv
(2048, 300)
Test embedding generated: \test_tfidf.csv


In [31]:
# Getting the tfidf matrix
test_tfidf_mat = test_df.iloc[:, 3:]
print(np.shape(test_tfidf_mat))

# Features (document) are stacked row by row in the matrix
test_vect_rep = np.matmul(test_tfidf_mat, test_embed_vect)
print(np.shape(test_vect_rep))

test_label = test_df['label'].values.flatten()
print("Testing feature extracted: " + test_input)

(40031, 2048)
(40031, 300)
Testing feature extracted: \train_tfidf.csv


## Fitting data into SVM

In [32]:
main_filepath = str(os.getcwd()) + "\\data"

test_input = "\\test.csv"

In [33]:
test_input_fp = main_filepath + test_input
test_df = pd.read_csv(test_input_fp)

classifier = svm.SVC(kernel = 'linear')
classifier.fit(train_vect_rep, train_label)

In [None]:
label_pred = classifier.predict(test_vect_rep)
test_label = test_df['label'].values.flatten()
test_df['predict'] = label_pred
test_df.to_csv(main_filepath + "\\final_linear_svm.csv", index=False)

cm = confusion_matrix(test_label, label_pred)
print(cm)
accuracy_score(test_label, label_pred)

In [None]:
test_input_fp = main_filepath + test_input
test_df = pd.read_csv(test_input_fp)

classifier2 = svm.SVC(kernel = 'rbf', gamma=0.1)
classifier2.fit(train_vect_rep, train_label)

In [None]:
label_pred = classifier2.predict(test_vect_rep)
test_label = test_df['label'].values.flatten()
test_df['predict'] = label_pred
test_df.to_csv(main_filepath + "\\final_rbf_svm.csv", index=False)

cm = confusion_matrix(test_label, label_pred)
print(cm)
accuracy_score(test_label, label_pred)