Text Normalization for the tweets in the db with use of GPU (only Tue laptop)

To make use of the Gpu:
*** Create anaconda envirorment ***
1- install cuda toolkit 12.1
2- install Pytorch cuda dependency
3- install nltk

In [9]:
import nltk
from nltk.corpus import stopwords
import sqlite3
import torch
import cupy as cp
import hashlib
import spacy
import emoji
import re
import time
from tqdm import tqdm
import os
import math

# Connect to the SQLite database
conn = sqlite3.connect(r'C:\Users\20232788\Desktop\DBL-1\tweets.db')
cursor = conn.cursor()

#check wether gpu is recognized and show info
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")
print(torch.__version__)
print(torch.version.cuda)
print("cuDNN Version:", torch.backends.cudnn.version())
torch.cuda.empty_cache()

True
NVIDIA RTX A1000 Laptop GPU
2.3.0+cu121
12.1
cuDNN Version: 8801


In [3]:
# Download the model within the Python script
spacy.cli.download("en_core_web_trf")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
# Redownload stopwords to ensure they are in the correct path
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\20232788/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\20232788/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Execute the query to fetch id and text
query = """
SELECT
    t.id_str,
    CASE
        WHEN t.truncated = 0 THEN t.text
        ELSE et.full_text
    END AS text
FROM
    tweets t
LEFT JOIN
    extended_tweets et ON t.id_str = et.id_str
WHERE
    t.lang = 'en';
"""
cursor.execute(query)
tweets = cursor.fetchall()

In [4]:
#Create new table for normalized text if it doesn't exist
create_table_query = """
CREATE TABLE emb_tweet_v2 (
    id_str TEXT PRIMARY KEY,
    norm_tweets TEXT
);
"""
cursor.execute(create_table_query)

OperationalError: table emb_tweet_v2 already exists

Cleaning of non relevant text for sentiment analysis

In [4]:
def make_lowercase(text):
    return text.lower()
    
def give_emoji_free_text(text):
    """
    Removes emoji's from tweets
    Accepts:
        Text (tweets)
    Returns:
        Text (emoji free tweets)
    """
    emoji_list = [c for c in text if c in emoji.EMOJI_DATA]
    clean_text = ' '.join([word for word in text.split() if not any(char in emoji_list for char in word)])
    return clean_text

def url_free_text(text):
    '''
    Cleans text from urls
    '''
    text = re.sub(r'http\S+', '', text)
    return text

def get_rid_of_mentions(text):
    """
    Removes mentions from tweets except for a specified list
    Accepts:
        Text (tweets)
    Returns:
        Text (tweets without unwanted mentions)
    """
    allowed_mentions = {
        'klm', 'airfrance', 'british_airways', 'americanair', 'lufthansa',
        'airberlin', 'easyjet', 'ryanair', 'singaporeair', 'qantas',
        'etihadairways', 'virginatlantic'
    }
    
    def mention_filter(match):
        mention = match.group(0)[1:]  # Remove the '@' and convert to lower case
        return match.group(0) if mention in allowed_mentions else ''
    
    return re.sub(r'@\w+', mention_filter, text)

def remove_rt_prefix(text):
    """
    Removes the 'RT' prefix from tweets
    Accepts:
        Text (tweets)
    Returns:
        Text (tweets without the 'RT' prefix)
    """
    return text.lstrip('RT ').lstrip('rt ')

In [5]:
processed_tweets = [
    (tweet_id, make_lowercase(give_emoji_free_text(url_free_text(get_rid_of_mentions(remove_rt_prefix(text))))))
    for tweet_id, text in tweets
]

In [6]:
len(processed_tweets)

4486241

In [21]:
query = """
SELECT count(id_str) FROM tweets
"""
cursor.execute(query)
emb_count = cursor.fetchall()
print(emb_count)

[(4413045,)]


Text Normalization (Tokenization, POS tagging, and lemmatization)

In [19]:
# Load the SpaCy model with GPU support and minibatch for faster approach
spacy.prefer_gpu()
spacy.require_gpu()
nlp = spacy.load("en_core_web_trf", disable=["parser", "ner"])


start_time = time.perf_counter()

# Define the beginning and ending boundaries
begin = 2756954  # Starting index
end = 2800000  # Ending index

tweets_to_process = processed_tweets[begin:]

num_batches = math.ceil(len(tweets_to_process) / 1024)

commit_threshold = 10000  # Commit after every 1000 records
records = []

# Initialize tqdm progress bar for the batches
with tqdm(total=len(tweets_to_process), desc="Processing tweets", unit="tweet") as progress_bar:
    # Process the tweets in minibatches
    for batch in spacy.util.minibatch(tweets_to_process, size=50):
        # Extract tweet IDs and texts
        tweet_ids, texts = zip(*batch)
        
        # Use nlp.pipe to process each batch of texts on the GPU
        docs = list(nlp.pipe(texts, batch_size=1024))
        
        for doc, tweet_id in zip(docs, tweet_ids):
            lemmas = [token.lemma_ for token in doc if token.lemma_ not in stop_words and not token.is_punct]
            normalized_text = ' '.join(lemmas)
            
            # Append the tweet ID and normalized text as a tuple
            records.append((tweet_id, normalized_text))
            
            # Update the progress bar
            progress_bar.update(1)
        
        # Check if the threshold is reached
        if len(records) >= commit_threshold:
            # Perform batch insert
            cursor.executemany("INSERT OR REPLACE INTO emb_tweet_v2 (id_str, norm_tweets) VALUES (?, ?)", records)
            conn.commit()
            records = []  # Reset the records list

 #Insert any remaining records that didnt meet the threshold
if records:
    cursor.executemany("INSERT OR REPLACE INTO emb_tweet_v2 (id_str, norm_tweets) VALUES (?, ?)", records)
    conn.commit()

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Processing tweets: 100%|███████████████████████████████████████████████| 1729287/1729287 [2:58:27<00:00, 161.50tweet/s]

Elapsed time: 10707.4781262 seconds





In [22]:
conn.commit()
conn.close()