### Import the necessary libraries

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType, StructType, StructField
from pyspark.ml.feature import Tokenizer, StopWordsRemover, StringIndexer
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
import re
import os
from nltk.corpus import wordnet
from pyspark.sql.column import Column
import string

### Configure NLTK Data Path
This cell defines the `setup_nltk_data_path` function, which ensures that the NLTK library has access to necessary data files stored in a specified directory. This setup is crucial for enabling NLTK functionalities such as tokenization, stemming, and sentiment analysis that depend on external data files.


In [0]:
def setup_nltk_data_path():
    """Make sure NLTK searches the shared ``/dbfs/nltk_data`` directory.

    The directory is appended to ``nltk.data.path`` only when it is not
    already listed, so calling this repeatedly never adds a duplicate entry.
    """
    data_dir = '/dbfs/nltk_data'

    # Already registered: nothing to do.
    if data_dir in nltk.data.path:
        return

    nltk.data.path.append(data_dir)

### Define Regular Expressions for Text Cleaning

This code cell defines a function `remove_pattern`, which deletes every match of a regular expression from a text column, together with the patterns used for cleaning: URLs, social media handles, hashtags, digits, leading and trailing punctuation, and emojis. Removing these patterns is an essential step in preprocessing text for machine learning.


In [0]:
def remove_pattern(col: Column, pattern: str) -> Column:
    """Return ``col`` with every match of ``pattern`` deleted.

    Args:
        col: Column expression holding the text to clean.
        pattern: Regular expression whose matches are replaced by the
            empty string.

    Returns:
        A new column expression built with ``F.regexp_replace``.
    """
    replacement = ""
    return F.regexp_replace(col, pattern, replacement)

# Regular expressions used with `remove_pattern`; every match is deleted.
url_pattern = r'http\S+|www\S+|https\S+'
handle_pattern = r'@\w+'
hashtag_pattern = r'#\w+'
numerical_pattern = r'\d+'
# Anchored at both ends: only a punctuation run at the very start or very end
# of the string is matched, punctuation in the middle is left alone.
punctuation_pattern = r'^[^\w\s]+|[^\w\s]+$'
# Character class of emoji/pictograph code points.
# The former single range U+24C2..U+1F251 also covered CJK ideographs, Hangul,
# fullwidth forms and many other non-emoji scripts, so ordinary text was being
# deleted. It is replaced by the explicit symbol ranges below, all of which lie
# inside the old range (nothing new is removed).
emoji_pattern = (
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    u"\U0001F680-\U0001F6FF"  # transport and map symbols
    u"\U0001F700-\U0001F77F"  # alchemical symbols
    u"\U0001F780-\U0001F7FF"  # geometric shapes extended
    u"\U0001F800-\U0001F8FF"  # supplemental arrows-C
    u"\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    u"\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2"             # circled latin capital letter M
    u"\U00002600-\U000026FF"  # miscellaneous symbols
    u"\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    u"\U0000FE00-\U0000FE0F"  # variation selectors
    u"\U0001F000-\U0001F251"  # tiles, playing cards, enclosed alphanumerics/ideographs (incl. flags)
    "]+"
)

### Load and Filter Tweet Data from Parquet Files

This cell reads the dataset containing tweets from a Parquet file using a predefined schema with PySpark. It also includes filtering to remove any rows where the 'tweet' field is null, ensuring that the dataset is ready for further processing.


In [0]:
# Only the `tweet` string column is requested from the Parquet files.
schema = StructType([StructField('tweet', StringType(), True)])

# Read with the explicit schema and keep only rows whose tweet text is present.
df = (
    spark.read
    .schema(schema)
    .parquet('/mnt/2024-team20/cleaned_datasets_parquet')
    .where(F.col('tweet').isNotNull())
)

### Clean Text Data by Removing URLs, Handles, Hashtags, and Other Patterns

This cell processes the tweet data by removing unwanted textual patterns such as URLs, social media handles, hashtags, numbers, punctuation, and emojis from each tweet. This cleaning is crucial for preparing the data for natural language processing tasks, improving the quality of the text analysis.


In [0]:
# Work on a copy of the raw text so the original `tweet` column stays intact.
df_clean = df.withColumn('clean_tweets', F.col('tweet'))

# Apply the removals one after another; each pattern sees the output of the
# previous one, so the order below matters.
for _pattern in (
    url_pattern,
    handle_pattern,
    hashtag_pattern,
    numerical_pattern,
    punctuation_pattern,
    emoji_pattern,
):
    df_clean = df_clean.withColumn(
        'clean_tweets',
        remove_pattern(F.col('clean_tweets'), _pattern),
    )

### Filter Tweets with Insufficient Word Count

This cell further refines the tweet dataset by filtering out tweets that contain fewer than four words. This step ensures that the remaining dataset only includes tweets with enough content to provide meaningful context for machine learning tasks.


In [0]:
# Count real words only. The pattern removals can leave whitespace at the start
# or end of the text; splitting that directly yields empty pieces which would
# be counted as words, so the text is trimmed before it is split.
_trimmed_tweet = F.regexp_replace(F.col('clean_tweets'), r'^\s+|\s+$', '')

# Keep tweets with at least four whitespace-separated words.
df_clean = df_clean.filter(F.size(F.split(_trimmed_tweet, '\\s+')) > 3)

### Tokenize and Clean Tweet Tokens

This cell performs tokenization on the cleaned tweet data, splitting each tweet into individual words or 'tokens'. Following tokenization, it further cleans the tokens by removing any empty or null entries.

In [0]:
tokenizer = Tokenizer(inputCol='clean_tweets', outputCol='tokens')

# Tokenize the cleaned text, then derive `tokens_cleaned` by discarding
# null and empty-string entries from the token array.
df_tokens = (
    tokenizer
    .transform(df_clean)
    .withColumn(
        'tokens_cleaned',
        F.expr("filter(tokens, x -> x IS NOT NULL AND x != '')"),
    )
)

### Remove Stop Words and Filter Empty Tokens

This cell removes commonly used stop words from the tokens to focus on more meaningful words in the tweets. Subsequently, any tweets that result in zero tokens after this removal are filtered out, ensuring that only tweets with meaningful content are retained.


In [0]:
# NOTE(review): the remover is built with its default stop-word list; if that
# list contains negations such as "not", the later sentiment step never sees
# them - confirm this is intended.
remover = StopWordsRemover(inputCol='tokens_cleaned', outputCol='filtered_tokens')

# Remove stop words, then drop tweets that have no tokens left.
df_filtered = (
    remover
    .transform(df_tokens)
    .where(F.size(F.col('filtered_tokens')) > 0)
)

### Refine Tokens by Stripping Punctuation

This cell defines and applies a custom function to further clean the tokenized text by stripping any remaining punctuation. The function `clean_array` iterates over each token, removes surrounding punctuation, and keeps only non-empty tokens. This step is essential for ensuring that the tokens are clean and meaningful.

In [0]:
def clean_array(arr):
    """Strip surrounding ASCII punctuation from each token and drop empties.

    Args:
        arr: List of token strings. ``None`` (for example a null array
            column value) is treated as an empty list, and ``None`` or empty
            tokens inside the list are skipped.

    Returns:
        A new list with the characters in ``string.punctuation`` removed from
        both ends of every token; tokens that become empty are omitted.
    """
    if arr is None:
        return []

    # Skip null/empty tokens before stripping so `.strip` is never called on None.
    stripped = (token.strip(string.punctuation) for token in arr if token)

    return [token for token in stripped if token]
  
# Wrap `clean_array` as a Spark UDF returning array<string>. `F.udf` is used
# because only `F` is imported in this notebook; a bare `udf` name would have
# to be injected by the runtime and raises NameError otherwise.
clean_array_udf = F.udf(clean_array, ArrayType(StringType()))

# Overwrite `filtered_tokens` with the punctuation-stripped tokens.
df_filtered = df_filtered.withColumn('filtered_tokens', clean_array_udf(F.col('filtered_tokens')))

### Filter Tweets to Retain Only Those with Sufficient Content

This cell enhances data quality by filtering out tweets with fewer than four tokens. This step ensures that only tweets with enough contextual information are retained for in-depth analysis.

In [0]:
# Keep only tweets that still have at least four tokens.
df_filtered = df_filtered.where(F.size(F.col('filtered_tokens')) > 3)

### Apply Lemmatization to Standardize Token Forms

This cell defines and applies a lemmatization function to the tokens to convert them into their base or dictionary forms.

In [0]:
def lemmatize_words(words):
    """Lemmatize every word in ``words`` with NLTK's ``WordNetLemmatizer``.

    The NLTK data path is registered first via ``setup_nltk_data_path``.

    Args:
        words: Iterable of word strings.

    Returns:
        A list with one lemmatized string per input word, in the same order.
    """
    setup_nltk_data_path()

    lemmatize = WordNetLemmatizer().lemmatize

    lemmas = []
    for word in words:
        lemmas.append(lemmatize(word))
    return lemmas

# Wrap `lemmatize_words` as a Spark UDF returning array<string>. `F.udf` is
# used because only `F` is imported in this notebook; a bare `udf` name would
# have to be injected by the runtime and raises NameError otherwise.
lemmatize_words_udf = F.udf(lemmatize_words, ArrayType(StringType()))

# Add the lemmatized tokens as a new column next to `filtered_tokens`.
df_lemmatized = df_filtered.withColumn('lemmatized_tokens', lemmatize_words_udf(F.col('filtered_tokens')))

### Filter Out Empty Token Lists After Lemmatization

This cell further refines the dataset by filtering out any tweets where the `lemmatized_tokens` column contains an empty list. This step ensures that only tweets with meaningful content (i.e., at least one valid, lemmatized word) are retained.

In [0]:
# Discard tweets whose lemmatized token list is empty.
has_lemmas = F.size(F.col('lemmatized_tokens')) > 0
df_lemmatized = df_lemmatized.where(has_lemmas)

### Define and Apply Sentiment Analysis Function to Lemmatized Tokens

This cell creates a sentiment analysis function `analyze_sentiment`, which uses NLTK's `SentimentIntensityAnalyzer` to evaluate the sentiment of the lemmatized tokens. The function labels each tweet based on the compound sentiment score: 'positive' when the score is at least 0.05, 'negative' when it is at most -0.05, and 'neutral' otherwise.

In [0]:
# Analyzer instance shared by all calls in the same Python process; it is
# created lazily on first use.
_SENTIMENT_ANALYZER = None


def analyze_sentiment(tokens):
    """Label a token list as 'positive', 'negative' or 'neutral'.

    The tokens are joined with single spaces and scored with NLTK's
    ``SentimentIntensityAnalyzer``. The label is derived from the ``compound``
    score: ``>= 0.05`` is positive, ``<= -0.05`` is negative, and anything in
    between is neutral.

    Args:
        tokens: Iterable of token strings, or ``None``.

    Returns:
        The sentiment label, or ``None`` when ``tokens`` is ``None``.
    """
    global _SENTIMENT_ANALYZER

    # A null token array carries no text to score; propagate the null.
    if tokens is None:
        return None

    if _SENTIMENT_ANALYZER is None:
        # Build the analyzer once and reuse it: constructing it loads the
        # lexicon file, which is far too expensive to repeat for every row.
        setup_nltk_data_path()
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer(
            lexicon_file='/dbfs/nltk_data/corpora/sentiment/vader_lexicon/vader_lexicon.txt'
        )

    compound = _SENTIMENT_ANALYZER.polarity_scores(' '.join(tokens))['compound']

    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'
     
# Wrap `analyze_sentiment` as a Spark UDF returning a string label. `F.udf` is
# used because only `F` is imported in this notebook; a bare `udf` name would
# have to be injected by the runtime and raises NameError otherwise.
analyze_sentiment_udf = F.udf(analyze_sentiment, StringType())

# Attach the sentiment label computed from the lemmatized tokens.
df_with_sentiment = df_lemmatized.withColumn('sentiment', analyze_sentiment_udf(F.col('lemmatized_tokens')))

### Remove Unnecessary Columns

This cell identifies and removes columns that are no longer needed, including 'tweet', 'tokens', 'tokens_cleaned', and 'filtered_tokens'.

In [0]:
# Intermediate columns that are not needed in the exported dataset.
columns_to_drop = [
    'tweet',
    'tokens',
    'tokens_cleaned',
    'filtered_tokens',
]

final_df = df_with_sentiment.drop(*columns_to_drop)

### Export the Final Dataset to Parquet Format

This cell saves the processed and labeled dataset to a Parquet file, overwriting any existing data at the specified location. Storing the data in Parquet format ensures efficient storage and quick access for future analysis, leveraging the columnar storage benefits to enhance data processing performance.


In [0]:
# Persist the labelled dataset as Parquet, replacing any previous export at this path.
final_df.write.parquet('/mnt/2024-team20/labelled_datasets_parquet', mode='overwrite')