In [2]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Load the cleaned dataset
file_path = 'cleaned_no_numbers_davidson2017.csv'  # Update with your file path
df = pd.read_csv(file_path)

# Ensure 'tweet' column exists
if 'tweet' not in df.columns:
    raise ValueError("The column 'tweet' is not present in the dataset.")


[nltk_data] Downloading package punkt to
[nltk_data]     /storage/home/tmv5264/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /storage/home/tmv5264/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /storage/home/tmv5264/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Ensure that the 'tweet' column has no NaN or non-string values
df['tweet'] = df['tweet'].fillna('')  # Replace NaN values with an empty string

# Convert non-string values to strings, if any
df['tweet'] = df['tweet'].astype(str)

# Tokenize the 'tweet' column
df['tokenized'] = df['tweet'].apply(word_tokenize)

# Preview the tokenized column
df[['tweet', 'tokenized']].head()



Unnamed: 0,tweet,tokenized
0,rt as a woman you shouldnt complain about clea...,"[rt, as, a, woman, you, shouldnt, complain, ab..."
1,rt boy dats coldtyga dwn bad for cuffin dat ho...,"[rt, boy, dats, coldtyga, dwn, bad, for, cuffi..."
2,rt dawg rt you ever fuck a bitch and she start...,"[rt, dawg, rt, you, ever, fuck, a, bitch, and,..."
3,rt she look like a tranny,"[rt, she, look, like, a, tranny]"
4,rt the shit you hear about me might be true or...,"[rt, the, shit, you, hear, about, me, might, b..."


In [5]:
# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize each word in the tokenized tweets
df['lemmatized'] = df['tokenized'].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)

# Preview the lemmatized column
df[['tokenized', 'lemmatized']].head()


Unnamed: 0,tokenized,lemmatized
0,"[rt, as, a, woman, you, shouldnt, complain, ab...","[rt, a, a, woman, you, shouldnt, complain, abo..."
1,"[rt, boy, dats, coldtyga, dwn, bad, for, cuffi...","[rt, boy, dat, coldtyga, dwn, bad, for, cuffin..."
2,"[rt, dawg, rt, you, ever, fuck, a, bitch, and,...","[rt, dawg, rt, you, ever, fuck, a, bitch, and,..."
3,"[rt, she, look, like, a, tranny]","[rt, she, look, like, a, tranny]"
4,"[rt, the, shit, you, hear, about, me, might, b...","[rt, the, shit, you, hear, about, me, might, b..."


In [6]:
# Function to handle negations
def handle_negations(tokens):
    negation_words = {"not", "no", "never", "n't"}
    negated_tokens = []
    negate = False
    for token in tokens:
        if token in negation_words:
            negate = True
            continue
        if negate:
            negated_tokens.append(f"NOT_{token}")
            negate = False
        else:
            negated_tokens.append(token)
    return negated_tokens

# Apply negation handling
df['negation_handled'] = df['lemmatized'].apply(handle_negations)

# Preview the negation-handled column
df[['lemmatized', 'negation_handled']].head()


Unnamed: 0,lemmatized,negation_handled
0,"[rt, a, a, woman, you, shouldnt, complain, abo...","[rt, a, a, woman, you, shouldnt, complain, abo..."
1,"[rt, boy, dat, coldtyga, dwn, bad, for, cuffin...","[rt, boy, dat, coldtyga, dwn, bad, for, cuffin..."
2,"[rt, dawg, rt, you, ever, fuck, a, bitch, and,...","[rt, dawg, rt, you, ever, fuck, a, bitch, and,..."
3,"[rt, she, look, like, a, tranny]","[rt, she, look, like, a, tranny]"
4,"[rt, the, shit, you, hear, about, me, might, b...","[rt, the, shit, you, hear, about, me, might, b..."


In [7]:
# Get the list of stop words
stop_words = set(stopwords.words('english'))

# Function to transform stop words into NOT_ format
def transform_stop_words(tokens):
    return [f"NOT_{token}" if token in stop_words else token for token in tokens]

# Apply the transformation
df['stop_words_transformed'] = df['negation_handled'].apply(transform_stop_words)

# Preview the stop-words transformed column
df[['negation_handled', 'stop_words_transformed']].head()


Unnamed: 0,negation_handled,stop_words_transformed
0,"[rt, a, a, woman, you, shouldnt, complain, abo...","[rt, NOT_a, NOT_a, woman, NOT_you, shouldnt, c..."
1,"[rt, boy, dat, coldtyga, dwn, bad, for, cuffin...","[rt, boy, dat, coldtyga, dwn, bad, NOT_for, cu..."
2,"[rt, dawg, rt, you, ever, fuck, a, bitch, and,...","[rt, dawg, rt, NOT_you, ever, fuck, NOT_a, bit..."
3,"[rt, she, look, like, a, tranny]","[rt, NOT_she, look, like, NOT_a, tranny]"
4,"[rt, the, shit, you, hear, about, me, might, b...","[rt, NOT_the, shit, NOT_you, hear, NOT_about, ..."


In [8]:
# Function to remove stop words
def remove_stop_words(tokens):
    return [token for token in tokens if token not in stop_words]

# Apply the function to remove stop words
df['final_tokens'] = df['stop_words_transformed'].apply(remove_stop_words)

# Preview the final cleaned tokens
df[['stop_words_transformed', 'final_tokens']].head()


Unnamed: 0,stop_words_transformed,final_tokens
0,"[rt, NOT_a, NOT_a, woman, NOT_you, shouldnt, c...","[rt, NOT_a, NOT_a, woman, NOT_you, shouldnt, c..."
1,"[rt, boy, dat, coldtyga, dwn, bad, NOT_for, cu...","[rt, boy, dat, coldtyga, dwn, bad, NOT_for, cu..."
2,"[rt, dawg, rt, NOT_you, ever, fuck, NOT_a, bit...","[rt, dawg, rt, NOT_you, ever, fuck, NOT_a, bit..."
3,"[rt, NOT_she, look, like, NOT_a, tranny]","[rt, NOT_she, look, like, NOT_a, tranny]"
4,"[rt, NOT_the, shit, NOT_you, hear, NOT_about, ...","[rt, NOT_the, shit, NOT_you, hear, NOT_about, ..."


In [9]:
# Save the cleaned dataset with final tokens
output_path = 'cleaned_final_tokens_davidson2017.csv'
df.to_csv(output_path, index=False)

print(f"Cleaned dataset saved to {output_path}")


Cleaned dataset saved to cleaned_final_tokens_davidson2017.csv
