# Data Preprocessing

In [1]:
import pandas as pd

In [2]:
# Load the train and test tweet datasets from the parquet files under datasets/.
train_df, test_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'datasets/btc_tweets_train.parquet.gzip',
        'datasets/btc_tweets_test.parquet.gzip',
    )
)

In [3]:
# Move each frame's current index into a regular column and fall back to a
# default integer index.
# NOTE: not idempotent - re-running this cell on the already-reset frames
# would add another column holding the previous integer index.
train_df, test_df = (frame.reset_index() for frame in (train_df, test_df))

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
#test_df.rename(columns={test_df.columns[0]: 'tweet ID'}, inplace=True)

In [6]:
def df_info(df):
    """
    Summarise a DataFrame in a single tuple.

    Parameters:
    - df: pd.DataFrame
        The DataFrame to inspect.

    Returns:
    - tuple of (shape, total number of null cells, result of df.info(), df.head()).
      df.info() prints its report as a side effect; its return value shows up
      as None in the tuple.
    """
    dimensions = df.shape
    missing_total = df.isnull().sum().sum()
    info_result = df.info()  # prints the column/dtype report
    preview = df.head()
    return dimensions, missing_total, info_result, preview

# Summarise the training frame (shape, null count, info report, first rows).
# NOTE(review): this cell's execution count (6) is later than that of the
# column-drop cell (4) that appears further down, and the recorded output
# lists only content/username/sentiment - on a fresh top-to-bottom run the
# frame would presumably still contain the columns dropped below. Confirm order.
df_info(train_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   content    1500 non-null   object
 1   username   1500 non-null   object
 2   sentiment  1500 non-null   bool  
dtypes: bool(1), object(2)
memory usage: 25.0+ KB


((1500, 3),
 0,
 None,
                                              content      username  sentiment
 0  $Bitcoin TO $100,000 SOONER THAN YOU THINK‼️💯🙏...   BezosCrypto       True
 1  Alright I have my rares. Who else is grabbing ...   spartantc81       True
 2  Bitcoin (BTC) Targets Over $100,000 as This Im...   BezosCrypto       True
 3  📢 Xverse Web-based pool is live:\n\n•Update @x...  godfred_xcuz       True
 4  Yesterday, a Bitcoin projection was displayed ...   goddess81oo       True)

In [4]:
# Discard the identifier/display columns from both frames.
train_df, test_df = (
    frame.drop(columns=['tweet ID', 'user_displayname', 'hashtags'])
    for frame in (train_df, test_df)
)

In [7]:
import re
from urllib.parse import urlparse

import emoji
import nltk
import numpy as np
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class DataPreprocessor:
    """
    Preprocessing pipeline for a DataFrame of labelled text samples.

    The individual steps (near-duplicate removal, text cleaning, SMOTE
    balancing) each replace ``self.df`` with their result and also return it.
    ``preprocess`` chains them in an order in which every step still finds the
    columns it needs.
    """

    # Name of the column that clean_text() adds.
    CLEANED_COLUMN = 'cleaned_content'

    def __init__(self, df, content_column, sentiment_column):
        """
        Initialize the DataPreprocessor class with the DataFrame and relevant columns.

        Parameters:
        - df: pd.DataFrame
            The DataFrame containing the dataset.
        - content_column: str
            The name of the column containing the text content to analyze.
        - sentiment_column: str
            The name of the column containing the sentiment labels.
        """
        self.df = df
        self.content_column = content_column
        self.sentiment_column = sentiment_column

    def remove_similar_content(self, similarity_threshold=0.65):
        """
        Removes samples with similar content based on cosine similarity.

        Every row that takes part in at least one pair whose TF-IDF cosine
        similarity exceeds the threshold is dropped; no representative of a
        group of similar rows is kept. The index is reset afterwards.

        Parameters:
        - similarity_threshold: float (default=0.65)
            The threshold for cosine similarity above which samples are considered similar.

        Returns:
        - pd.DataFrame: the filtered DataFrame (also stored in self.df).
        """
        # Fewer than two rows means there is no pair to compare.
        if len(self.df) < 2:
            self.df = self.df.reset_index(drop=True)
            return self.df

        tfidf = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf.fit_transform(self.df[self.content_column])
        cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

        # Strict upper triangle == every unordered pair (i, j) with i < j,
        # which excludes each row's similarity with itself.
        pair_is_similar = np.triu(cosine_sim > similarity_threshold, k=1)
        # A row is removed if it is either member of any similar pair.
        row_is_similar = pair_is_similar.any(axis=0) | pair_is_similar.any(axis=1)

        # Filter by position with a boolean mask: the similarity matrix is
        # positional, so dropping by index label is only correct for a default
        # 0..n-1 index.
        self.df = self.df.loc[~row_is_similar].reset_index(drop=True)
        return self.df

    def clean_text(self):
        """
        Cleans text data by removing links, punctuation, digits and stopwords,
        and applying lemmatization.

        The cleaned text is stored in a new 'cleaned_content' column; the
        original content column is left untouched.

        Returns:
        - pd.DataFrame: the DataFrame with the added column (also stored in self.df).
        """
        # word_tokenize needs the punkt tokenizer models and WordNetLemmatizer
        # needs the wordnet corpus; fetching only 'stopwords' fails on a fresh
        # environment. 'punkt_tab' / 'omw-1.4' cover NLTK releases that look
        # for those resource names instead.
        for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
            nltk.download(resource, quiet=True)

        stop_words = set(stopwords.words('english'))
        # Built once and shared by every tweet instead of once per tweet.
        lemmatizer = nltk.WordNetLemmatizer()

        def clean(tweet):
            # Convert emojis to text, e.g. "😊" becomes " smiling_face "
            tweet = emoji.demojize(tweet, delimiters=(" ", " "))
            # Remove links
            tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
            # Remove punctuation and numbers (this also strips the underscores
            # inside emoji names)
            tweet = re.sub(r'[^a-zA-Z\s]', '', tweet)
            # Tokenize and lowercase
            tokens = (token.lower() for token in nltk.word_tokenize(tweet))
            # Drop stop words, lemmatize the rest
            words = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
            return ' '.join(words)

        cleaned = self.df[self.content_column].apply(clean)
        # assign() builds a new frame, so the DataFrame handed to __init__ is
        # not modified behind the caller's back.
        self.df = self.df.assign(**{self.CLEANED_COLUMN: cleaned})
        return self.df

    def handle_class_imbalance_with_SMOTE(self, text_column=None):
        """
        Handles class imbalance using SMOTE (Synthetic Minority Over-sampling Technique).

        The text is TF-IDF vectorized and SMOTE is applied to the vectors.
        self.df is REPLACED by the resampled feature matrix (one column per
        TF-IDF term) plus the sentiment column; the text columns are gone
        afterwards. If a TF-IDF term has the same name as the sentiment
        column, that feature column is overwritten by the labels.

        Parameters:
        - text_column: str or None (default=None)
            The column to vectorize. None uses the content column given to __init__.

        Returns:
        - pd.DataFrame: the resampled feature matrix with labels (also stored in self.df).
        """
        source_column = self.content_column if text_column is None else text_column

        tfidf = TfidfVectorizer(stop_words='english')
        X = tfidf.fit_transform(self.df[source_column])
        y = self.df[self.sentiment_column]

        # Apply SMOTE to the vectorized text
        smote = SMOTE(random_state=21)
        X_resampled, y_resampled = smote.fit_resample(X, y)

        features = pd.DataFrame(X_resampled.toarray(), columns=tfidf.get_feature_names_out())
        # Assign by position so no index alignment can introduce missing labels.
        features[self.sentiment_column] = np.asarray(y_resampled)
        self.df = features
        return self.df

    def preprocess(self, remove_similar=True, balance_classes=True, clean_text=True):
        """
        Performs the full preprocessing pipeline.

        Order: similar-content removal, text cleaning, class balancing.
        Balancing runs last because it replaces the DataFrame with a TF-IDF
        feature matrix that no longer contains the text column that cleaning
        reads. When cleaning is enabled, the cleaned text is what gets
        vectorized for SMOTE.

        Parameters:
        - remove_similar: bool (default=True)
            Whether to remove similar content.
        - balance_classes: bool (default=True)
            Whether to handle class imbalance.
        - clean_text: bool (default=True)
            Whether to clean text data.

        Returns:
        - pd.DataFrame: the processed DataFrame (also stored in self.df).
        """
        if remove_similar:
            self.remove_similar_content()
        if clean_text:
            self.clean_text()
        if balance_classes:
            self.handle_class_imbalance_with_SMOTE(
                text_column=self.CLEANED_COLUMN if clean_text else None
            )

        return self.df


In [8]:
# Build the preprocessor for the training set: text is read from 'content',
# labels from 'sentiment'.
train_preprocesser = DataPreprocessor(df=train_df, content_column='content', sentiment_column='sentiment')

In [9]:
# Remove similar tweets and add the 'cleaned_content' column; SMOTE balancing
# is switched off (balance_classes=False), so the class distribution is left as is.
cleaned_train_df = train_preprocesser.preprocess(remove_similar=True, balance_classes=False, clean_text=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Diya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Build the preprocessor for the test set with the same column mapping as the
# training set.
test_preprocesser = DataPreprocessor(df=test_df, content_column='content', sentiment_column='sentiment')

In [11]:
# Same steps as for the training set: similar-tweet removal and text cleaning,
# no SMOTE balancing.
cleaned_test_df = test_preprocesser.preprocess(remove_similar=True, balance_classes=False, clean_text=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Diya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Row/column counts of both frames after preprocessing.
cleaned_train_df.shape, cleaned_test_df.shape

((1399, 4), (475, 4))

In [18]:
# Compare the raw 'content' with the new 'cleaned_content' on the first rows.
cleaned_train_df.head()

Unnamed: 0,content,username,sentiment,cleaned_content
0,Alright I have my rares. Who else is grabbing ...,spartantc81,True,alright rares else grabbing dogepunksbtc disco...
1,📢 Xverse Web-based pool is live:\n\n•Update @x...,godfred_xcuz,True,loudspeaker xverse webbased pool live update x...
2,"Yesterday, a Bitcoin projection was displayed ...",goddess81oo,True,yesterday bitcoin projection displayed europea...
3,Unpopular opinion:\n\nThis pump isn’t going to...,CloseSomeSayles,False,unpopular opinion pump isnt going stop enterin...
4,#Bitcoin fixes this,ShannenJPEG,True,bitcoin fix


In [23]:
# Class distribution of the sentiment label in the cleaned train and test frames.
cleaned_train_df['sentiment'].value_counts(), cleaned_test_df['sentiment'].value_counts()

(sentiment
 True     1125
 False     274
 Name: count, dtype: int64,
 sentiment
 True     379
 False     96
 Name: count, dtype: int64)

**NOTE: both cleaned datasets above are still imbalanced, with True values largely outnumbering False values. The imbalance needs to be handled by assigning class weights during model training.**

# 1. Benchmark: vaderSentiment Sentiment Dictionary

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer