In [1]:
import html
import re
import string

import inflect
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import words, stopwords
from nltk.stem import WordNetLemmatizer
from tqdm.notebook import tqdm

In [None]:
nltk.download('wordnet')
nltk.download('omw-1.4')

In [2]:
df = pd.read_csv('chatgpt_first_month_tweets.csv')

In [3]:
stop_words = set(stopwords.words("english"))

In [4]:
# corpus of all words in the English dictionary
english_words = set(words.words())

# words that contain "nn", only doing this to fix issues with the preprocessing function
nn_words = [word for word in english_words if 'nn' in word]

In [5]:
# get the lemmatized forms of all the words
lemmatized_words = []
lemmatizer = WordNetLemmatizer()
for nn_word in tqdm(nn_words, total=len(nn_words)):
    lemmatized_words.append(lemmatizer.lemmatize(nn_word, pos='n'))

  0%|          | 0/2421 [00:00<?, ?it/s]

In [6]:
# get all other forms of all the words
all_nn_words = []
all_nn_words.extend(nn_words)
all_nn_words.extend(lemmatized_words)

engine = inflect.engine()

for lemmatized_word in tqdm(lemmatized_words, total=len(lemmatized_words)):
    all_forms_of_word = []
    all_forms_of_word.append(engine.present_participle(lemmatized_word))
    all_forms_of_word.append(engine.plural(lemmatized_word))
    all_forms_of_word.append(engine.plural_noun(lemmatized_word))
    all_forms_of_word.append(engine.plural_verb(lemmatized_word))
    all_forms_of_word.append(engine.plural_adj(lemmatized_word))
    
    all_nn_words.extend(all_forms_of_word)

  0%|          | 0/2421 [00:00<?, ?it/s]

In [25]:
def preprocess(tweet):
    # Prevent the HTML entity "&" from not being converted back to its original form
    tweet = html.unescape(tweet)
    # Remove URLs
    tweet = re.sub(r"http\S+", "", tweet)
    # Remove HTML tags
    tweet = re.sub(r"<.*?>", "", tweet)
    # Remove emoticons and emoji
    tweet = re.sub(r"[\U0001f600-\U0001f650]", "", tweet)
    # Remove punctuation
    tweet = tweet.translate(str.maketrans("", "", string.punctuation))
    # Remove stop words
    tweet = " ".join([word for word in word_tokenize(tweet.lower()) if word not in stop_words])
    # Remove hashtags
    tweet = re.sub(r"#\S+", "", tweet)
    # Remove mentions
    tweet = re.sub(r"@\S+", "", tweet)
    # Remove numerical data
    tweet = re.sub(r"[0-9]+", "", tweet)
    # Remove special characters
    tweet = re.sub(r"[^\x00-\x7F]+", "", tweet)
    # Replace 1 or more consecutive newline characters with a single whitespace
    tweet = re.sub(r"\n+", " ", tweet)
    # Get rid of "nn" between any words not in all_nn_words
    nn_regex = r"(?=.*\b" + r"\b)(?=.*\b".join(all_nn_words) + r"\b)"
    tweet = re.sub(nn_regex + "nn", " ", tweet)
    
    # Replace any occurrences of "nn" with a blank space ONLY IF
    # words containing "nn" in the tweet are not in the English dictionary
    # in other words, only get rid of the "nn" occurrences if the occurrence
    # of "nn" in the tweet is garbage, but if it is a valid word in the English
    # dictionary, then keep it.
    flag = True
    for nn_word in all_nn_words:
        if nn_word in tweet.lower():
            flag = False
            break
    if flag:
        tweet = re.sub(r"nn", " ", tweet)
        tweet = re.sub(r"(?<=\w)nn(?=\w)", " ", tweet)
    
    # Replace any occurrence of "n" surrounded by blank spaces with a blank space
    tweet = re.sub(r"\s+n\s+", " ", tweet)
    # Replace any occurrence of "n" at the very end of a line with a blank space
    tweet = re.sub(r"n\s*$", " ", tweet)
    # Remove stop words again
    tweet = " ".join([word for word in word_tokenize(tweet.lower()) if word not in stop_words])
    
    return tweet

In [26]:
# evaluate 100-400 tweets to see how the preprocessing is doing

In [27]:
for i, row in df.iterrows():
    if i > 50:
        print(row['tweet'])
        print()
        print(preprocess(row['tweet']))
        print('\n'*4)
    
    if i > 150:
        break

Am I the only one excited about the advances in the NLP field?\n\n@OpenAI ChatGPT is cool, answered superposition. I loved the answer.\n\nhttps://t.co/8uIxGtU9pF\n\n#ai #openai #chatGPT #technology #nlp

one excited advances nlp field openai chatgpt cool answered superposition loved answer openai chatgpt technology nlp





ChatGPT thwarts my many attempts at malicious JavaScript injections https://t.co/jXkC9ojUwy

chatgpt thwarts many attempts malicious javascript injections





Damn, OpenAI is at it again. Just tried this and the implications of having an assistant like ChatGPT at your disposal is revolutionary. https://t.co/QJUQMAXdMq

damn openai tried implications assistant like chatgpt disposal revolutionary





ChatGPT about @kunalb11 - English essay writing is going to go for a toss! https://t.co/8t2GKX3Lcl

chatgpt kunalb english essay writing going go toss





Me: How are you doing?\n\nChatGPT: As a language model trained by OpenAI, I don't have the ability to feel emotion

chatgpt got released ive seen future could replace google informational queries





Playing with new ChatGPT.  Super fun and impressive in many ways. But still quite far from reliable ... https://t.co/G5YbC1N1vp

playing new chatgpt super fun impressive many ways still quite far reliable





ChatGPT: Optimizing Language Models for Dialogue https://t.co/Om46Y5XXFV https://t.co/KQ706YGKKF

chatgpt optimizing language models dialogue





ChatGPT: Optimizing Language Models for Dialogue https://t.co/EdhSl0qnWA https://t.co/F2A7gQHJVq

chatgpt optimizing language models dialogue





This is what ChatGPT AI by @OpenAI has to say about Tinubu's presidential aspiration despite his frail health and history with corruption. https://t.co/aLt4SUYzvP

chatgpt ai openai say tinubus presidential aspiration despite frail health history corruptio





OpenAI ChatGPT: Optimizing language models for dialogue https://t.co/K8vygik2Tz https://t.co/pzFs939Ck0

openai chatgpt optimizing language models di