In [26]:
# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer  # Changed to Snowball Stemmer
import re
from os import path
import string
import emoji


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\pepper\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pepper\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pepper\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\pepper\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [27]:
# Load the dataset
df = pd.read_csv('c:\\Users\\pepper\\Downloads\\filtered_tweets_text_only.csv', header=None)
df.columns = ['index', 'text']  


In [28]:
# Generate a new index format for the dataset
df['index'] = pd.Series(["D" + str(ind) for ind in df['index']])

In [29]:
# Display the first few rows of the dataset
print(df.head())

    index                                               text
0  Dindex                                               text
1      D0  100 Best Places to Visit in USA - Valley of Fi...
2      D1  Can we all agree that ye has a point\n\n#Kanye...
3      D2  2022-12-19T10:00:04.5267368Z▶▶50% OFF! Get you...
4      D3                           #KanyeWest  SUCK MY DİCK


In [30]:
# Remove commas, punctuation, and make the text lowercase
df['text'] = df['text'].str.replace(",", " ")
df['text'] = df['text'].str.replace(r'\W', ' ', regex=True)
df['text'] = df['text'].str.strip().str.lower()
df['text'] = df['text'].str.replace(f"[{string.punctuation}]", " ", regex=True)

In [31]:
# Define a function to remove emojis and emoticons
def remove_emoji_and_emoticons(string):
    # Remove emojis using the emoji library
    string = emoji.replace_emoji(string, replace='')
    # Regex to remove emoticons
    emoticon_pattern = re.compile(r'[:;=][oO\-]?[)\(\[\]DdpP]')
    string = emoticon_pattern.sub('', string)
    return string

In [32]:
# Apply the function to the 'text' column
df['text'] = df['text'].apply(remove_emoji_and_emoticons)

In [33]:
# Tokenize the text using word_tokenize
df['text'] = df['text'].apply(word_tokenize)

In [34]:
# Export tokenized text to a CSV file
df.to_csv('tokenized_text.csv', index=False)

In [35]:
# Remove non-informative tokens (e.g., punctuation-like tokens)
df['text'] = df['text'].apply(lambda x: [token for token in x if re.search(r'\w+', token)])

In [36]:
# Remove stopwords
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: [word for word in x if word not in stop_words])

In [37]:
# Save the cleaned text to a CSV file (optional)
df.to_csv('stopword.csv', index=False)

In [38]:
# Initialize Snowball stemmer (less aggressive than Porter)
stemmer = SnowballStemmer('english')

In [39]:
# Apply stemming first
df['text'] = df['text'].apply(lambda x: [stemmer.stem(word) for word in x])

In [40]:
# Save the stemmed text to a CSV file (optional)
df.to_csv('stemmed_text.csv', index=False)

In [41]:
# Apply lemmatization using WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

In [42]:
# Remove residual possessives (e.g., "'s" becoming "s")
df['text'] = df['text'].apply(lambda x: [word for word in x if word != 's'])

In [43]:
# Save the lemmatized text to a CSV file (optional)
df.to_csv('lemmatized_text.csv', index=False)

In [44]:
# Import TfidfVectorizer for TF-IDF calculation
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
# Create a list of document texts
documents = df['text'].apply(lambda x: ' '.join(x)).tolist()

In [46]:
# Step 1: Create word count matrix using TfidfVectorizer
vectorizer = TfidfVectorizer(use_idf=False, norm=None, binary=False)
word_count_matrix = vectorizer.fit_transform(documents).toarray()

In [47]:
# Create DataFrame for word frequencies
word_freq_df = pd.DataFrame(word_count_matrix, columns=vectorizer.get_feature_names_out())
word_freq_df['index'] = df['index']


In [48]:
# Set 'index' as the first column in the DataFrame
word_freq_df = word_freq_df.set_index('index')

In [49]:
# Save the word frequency DataFrame to a CSV file
word_freq_df.to_csv('word_frequency.csv')

In [50]:
# Display the cleaned DataFrame
print(df.head())

    index                                               text
0  Dindex                                             [text]
1      D0  [100, best, place, visit, usa, valley, fire, s...
2      D1     [agre, ye, point, kanyewest, ye2024, joebiden]
3      D2  [2022, 12, 19t10, 00, 04, 5267368z, 50, get, h...
4      D3                           [kanyewest, suck, di̇ck]
