## 1. Generate 10,000 tweets using Transformer
I generate 1,000 tweets per run and run the generation 10 times, because generating all 10,000 tweets in a single call raised an error.

In [27]:
from transformers import pipeline

# Text-generation pipeline backed by the "gpt2" checkpoint; generate_tweets
# sends its prompts through this object.
generator = pipeline(task="text-generation", model="gpt2")

def generate_tweets(prompt, num_tweets = 5, batch_size = 100):
    """Generate `num_tweets` texts from `prompt` with the module-level `generator`.

    The sequences are requested in chunks of at most `batch_size` per
    pipeline call instead of all at once. The notebook notes that asking
    for 10,000 tweets in a single call raised an error; chunking lets one
    call to this function produce any number of tweets.

    Args:
        prompt: Text passed to the pipeline as the generation prompt.
        num_tweets: Total number of sequences to generate. Values of zero
            or less produce an empty list.
        batch_size: Maximum number of sequences requested per pipeline
            call. Must be at least 1.

    Returns:
        list: The items returned by `generator` for every chunk,
        concatenated in request order. The CSV-writing cell reads
        `item["generated_text"]` from each of them.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    tweets = []
    remaining = num_tweets
    while remaining > 0:
        # The last chunk may be smaller than batch_size.
        chunk = min(batch_size, remaining)
        tweets.extend(generator(prompt, num_return_sequences = chunk))
        remaining -= chunk
    return tweets

# Prompt given to the model. The generated texts contain this prompt; it is
# stripped out again when the tweets are combined.
prompt = "Generate a tweet about tech layoff"

# One generation pass of 1,000 tweets.
synthetic_tweets = generate_tweets(prompt, num_tweets = 1000)

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
Device set to use 0


## 2. Write the generated tweets to a CSV file

In [28]:
import csv
import os

# Append this run's tweets to the CSV file. Append mode keeps the tweets
# written by earlier generation runs.
csv_filename = "generated_tweets.csv"

# The file is read back with pd.read_csv and its "Tweet" column is selected,
# so a new (or empty) file must start with that header row. Without it the
# first tweet would be consumed as the header and the column lookup would fail.
needs_header = not os.path.exists(csv_filename) or os.path.getsize(csv_filename) == 0

with open(csv_filename, mode="a", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    if needs_header:
        writer.writerow(["Tweet"])
    # One row per generated tweet, single column.
    writer.writerows([tweet["generated_text"]] for tweet in synthetic_tweets)

## 3. Combine the generated tweets and the tweets from Twitter API last week
I also remove the prompt "Generate a tweet about tech layoff" from the generated tweets.

In [10]:
import pandas as pd
import re
# Combine two tweet sources into one CSV: the tweets created with the
# Transformer and the tweets retrieved from the Twitter API.

def _strip_prompt(value):
    """Return `value` as a string with the generation prompt removed.

    Any punctuation characters directly following the prompt are removed too.
    """
    return re.sub(r'Generate a tweet about tech layoff[^\w\s]*', '', str(value))

# Generated tweets, with the prompt stripped from every entry.
df1 = pd.read_csv('generated_tweets.csv')
df1['Tweet'] = df1['Tweet'].map(_strip_prompt)

# Twitter API tweets: keep only the text column, renamed to match df1.
df2 = pd.read_csv('tweets.csv')
df2_tweets = df2[['text']].rename(columns={'text': 'Tweet'})

# Stack both frames with a fresh index and save the result.
df_combined = pd.concat([df1, df2_tweets], ignore_index=True)
df_combined.to_csv('combined_tweets.csv', index=False)

## 4. Perform text preprocessing

Techniques that I use for text preprocessing:
- Text normalization (lowercasing)
- URL removal
- HTML tag removal
- Punctuation removal
- Whitespace management
- Removal of words containing digits
- Tokenization
- Stopword removal
- Lemmatization


In [None]:
import re
import string
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK resources this cell relies on: the stopword lists and WordNet.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# English stopwords, held in a set for the membership test in normalized_text.
stop_words = set(stopwords.words('english'))

# Tokenizer and lemmatizer instances shared by every normalized_text call.
wpt = WordPunctTokenizer()
lemmatizer = WordNetLemmatizer()

def normalized_text(text):
    """Clean one tweet and return it as a space-joined string of lemmas.

    The text is lowercased, then square-bracketed spans, URLs, HTML-like
    tags, ASCII punctuation, newlines and words containing a digit are each
    replaced by a space. The remainder is tokenized with `wpt`, tokens found
    in `stop_words` are dropped, and the rest are lemmatized with
    `lemmatizer`.

    Args:
        text: The raw tweet. Any value that is not a `str` is accepted.

    Returns:
        str: The cleaned text, or '' when `text` is not a string.
    """
    # Non-string input is mapped to an empty string.
    if not isinstance(text, str):
        return ''

    # Applied in this order; every match is replaced by a single space.
    patterns = (
        r'\[.*?\]',                                  # square-bracketed spans
        r'https?://\S+|www\.\S+',                    # URLs
        r'<.*?>+',                                   # HTML-like tags
        r'[%s]' % re.escape(string.punctuation),     # ASCII punctuation
        r'\n',                                       # newlines
        r'\w*\d\w*',                                 # words containing a digit
    )
    cleaned = text.lower()
    for pattern in patterns:
        cleaned = re.sub(pattern, ' ', cleaned)

    # Tokenize, skip stopwords, and lemmatize what is left (no stemming).
    lemmas = []
    for word in wpt.tokenize(cleaned):
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))

    return ' '.join(lemmas)

# Load the combined tweets and replace the 'Tweet' column with its cleaned
# version.
data = pd.read_csv("combined_tweets.csv").assign(
    Tweet=lambda frame: frame['Tweet'].apply(normalized_text)
)

# Write the cleaned tweets to their own CSV file, without the index column.
data.to_csv("cleaned_tweets.csv", index=False)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tranminhanh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tranminhanh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
