In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import string
import re

In [2]:
# Read the preprocessed Reddit comments CSV into a DataFrame.
df = pd.read_csv(
    r'F:\Kabir\CSUN\R\preprocessed_reddit_comments_python_final.csv'
)

In [3]:
# Preview the first two rows of the loaded data.
df.head(n=2)

Unnamed: 0,POSTNAME,AUTHOR,COMMENT,rejoined_tokenized_comments,sentiment,lemmatized
0,'Immaculate' Review Thread : r/boxoffice,Daydream_machine,Oof that’s admittedly lower than I expected. D...,oof admittedly lower expected disappointed rea...,negative,"['oof', 'admittedly', 'lower', 'expected', 'di..."
1,'Immaculate' Review Thread : r/boxoffice,Interesting_Tie_1870,There is a scene towards the beginning of the ...,scene towards beginning movie feature red flas...,positive,"['scene', 'towards', 'beginning', 'movie', 'fe..."


In [4]:
def punctuation_counter(text):
    """Return the share of non-space characters in ``text`` that are punctuation.

    The result is ``100 * punctuation_chars / non_space_chars`` rounded to two
    decimals. Only the ASCII characters in ``string.punctuation`` count as
    punctuation (typographic quotes such as ’ do not), and only the plain
    space character ``" "`` is excluded from the denominator (tabs and
    newlines still count as characters).

    Parameters
    ----------
    text : str
        The text to measure.

    Returns
    -------
    float
        Punctuation percentage in the range 0-100. Returns ``0.0`` when
        ``text`` is not a string (e.g. NaN from an empty CSV cell) or has no
        non-space characters, instead of raising.
    """
    # Missing values arrive as float NaN / None; they contain no punctuation.
    if not isinstance(text, str):
        return 0.0

    non_space_chars = len(text) - text.count(" ")
    # Empty or space-only text would otherwise divide by zero.
    if non_space_chars == 0:
        return 0.0

    punctuation_chars = sum(1 for char in text if char in string.punctuation)
    return round(punctuation_chars * 100 / non_space_chars, 2)

# Compute the punctuation percentage of every raw comment, element by element.
df['punctuation_percent'] = df['COMMENT'].map(punctuation_counter)

In [5]:
# Rename the label column from 'sentiment' to 'sentiment_class' so it is not
# deleted during training.
# NOTE(review): presumably this prevents a clash with a TF-IDF term column
# that is also named 'sentiment' — confirm.
df = df.rename({'sentiment': 'sentiment_class'}, axis='columns')

In [6]:
# Preview the first two rows again, now showing the current set of columns.
df.head(n=2)

Unnamed: 0,POSTNAME,AUTHOR,COMMENT,rejoined_tokenized_comments,sentiment_class,lemmatized,punctuation_percent
0,'Immaculate' Review Thread : r/boxoffice,Daydream_machine,Oof that’s admittedly lower than I expected. D...,oof admittedly lower expected disappointed rea...,negative,"['oof', 'admittedly', 'lower', 'expected', 'di...",3.38
1,'Immaculate' Review Thread : r/boxoffice,Interesting_Tie_1870,There is a scene towards the beginning of the ...,scene towards beginning movie feature red flas...,positive,"['scene', 'towards', 'beginning', 'movie', 'fe...",0.68


In [7]:
# Hold out 20% of the rows as a test set. Features are the cleaned comment
# text plus the punctuation percentage; the target is the sentiment label.
# The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[['rejoined_tokenized_comments', 'punctuation_percent']],
    df['sentiment_class'],
    test_size=0.2,
    random_state=42,
)



In [8]:
# Create the TF-IDF vectorizer with scikit-learn's default settings
# (no constructor arguments are overridden here).
tfidf_vectorizer = TfidfVectorizer()

In [9]:
# Make every entry of 'rejoined_tokenized_comments' a plain string.
# Missing comments are replaced with '' first: casting NaN straight to a
# string yields the literal text "nan", which the vectorizer would then treat
# as a real word. `assign` builds new frames instead of writing into the
# frames returned by train_test_split, which can trigger pandas'
# SettingWithCopyWarning.
X_train = X_train.assign(
    rejoined_tokenized_comments=X_train['rejoined_tokenized_comments'].fillna('').astype(str)
)
X_test = X_test.assign(
    rejoined_tokenized_comments=X_test['rejoined_tokenized_comments'].fillna('').astype(str)
)

In [10]:
# Learn the vocabulary and IDF weights from the training comments only, and
# get the TF-IDF matrix for the training set in the same pass.
X_train_tfidf_text = tfidf_vectorizer.fit_transform(
    X_train['rejoined_tokenized_comments']
)

# Terms of the learned vocabulary; used as column names for the TF-IDF features.
feature_names = tfidf_vectorizer.get_feature_names_out()

In [11]:
# Transform the test text with the vocabulary learned from the training data.
# The training matrix `X_train_tfidf_text` was already produced by
# `fit_transform` when the vectorizer was fitted, so recomputing it here would
# only repeat the same work.
X_test_tfidf_text = tfidf_vectorizer.transform(X_test['rejoined_tokenized_comments'])

In [12]:
# Expand the TF-IDF matrices into dense DataFrames, one column per vocabulary
# term, labelled with the term itself.
X_train_final = pd.DataFrame(
    data=X_train_tfidf_text.toarray(),
    columns=feature_names,
)
X_test_final = pd.DataFrame(
    data=X_test_tfidf_text.toarray(),
    columns=feature_names,
)

In [13]:
# Build the final train/test tables: label, TF-IDF term columns, then the
# punctuation percentage.
#
# pd.concat(axis=1) aligns rows by index label, so every piece must share the
# same 0..n-1 index before concatenation. The punctuation column is pulled out
# into its own re-indexed Series: calling reset_index(inplace=True) on
# `X_train['punctuation_percent']` only modifies a temporary object and is not
# guaranteed to affect what a later `X_train['punctuation_percent']` returns
# (it has no effect under pandas Copy-on-Write), which would misalign the rows.

y_train = y_train.reset_index(drop=True)
train_punctuation = X_train['punctuation_percent'].reset_index(drop=True)
X_train_final = X_train_final.reset_index(drop=True)

y_test = y_test.reset_index(drop=True)
test_punctuation = X_test['punctuation_percent'].reset_index(drop=True)
X_test_final = X_test_final.reset_index(drop=True)

X_train_final3 = pd.concat([y_train, X_train_final, train_punctuation], axis=1)
X_test_final3 = pd.concat([y_test, X_test_final, test_punctuation], axis=1)

In [14]:
# Show the last five rows of the combined training table.
X_train_final3.tail(n=5)

Unnamed: 0,sentiment_class,aake,aapki,aapse,abhorrent,ability,abject,able,abomination,abrasive,...,zoo,zoom,zoomedin,zoomers,zooming,zz,zürich,äckligt,åt,punctuation_percent
2712,negative,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.23
2713,positive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.08
2714,positive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.65
2715,positive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.44
2716,negative,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.12


In [15]:
# Write the combined train and test tables to CSV, leaving out the row index.
X_train_final3.to_csv(
    r'F:\Kabir\CSUN\R\training_data_final.csv',
    index=False,
)
X_test_final3.to_csv(
    r'F:\Kabir\CSUN\R\testing_data_final.csv',
    index=False,
)

In [16]:
import pickle

# Serialize the fitted TF-IDF vectorizer to disk in binary mode.
with open(r'F:\Kabir\CSUN\R\tfidf_vectorizer.pkl', mode='wb') as file:
    pickle.dump(tfidf_vectorizer, file)