# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the training and test tweet datasets from their parquet files.
train_df = pd.read_parquet('datasets/btc_tweets_train.parquet.gzip')
test_df = pd.read_parquet('datasets/btc_tweets_test.parquet.gzip')

In [4]:
# Move the existing index into a regular column and switch to a default
# 0..n-1 integer index.
train_df = train_df.reset_index()

In [None]:
# Sanity check: confirm the object type after loading and re-indexing.
type(train_df)

In [None]:
def df_info(df):
    """Return a quick summary tuple for a DataFrame.

    The tuple holds, in order: the shape, the total number of null cells,
    the result of ``df.info()`` and the first rows from ``df.head()``.
    ``df.info()`` prints its report and returns None, so the third element
    is always None.
    """
    dimensions = df.shape
    missing_total = df.isnull().sum().sum()
    info_result = df.info()
    preview = df.head()
    return dimensions, missing_total, info_result, preview

In [None]:
# Inspect shape, total null count, column info and the first rows of the training data.
df_info(train_df)

In [5]:
# Drop the 'tweet ID' and 'user_displayname' columns; the preprocessing below
# only reads the text and sentiment columns.
train_df = train_df.drop(['tweet ID', 'user_displayname'], axis=1)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from imblearn.over_sampling import SMOTE
import nltk
from nltk.corpus import stopwords
import re
import emoji
from urllib.parse import urlparse

class DataPreprocessor:
    """
    Preprocessing steps for a DataFrame of labelled text.

    Every step stores its result in ``self.df`` and also returns it;
    ``preprocess`` chains the steps behind boolean switches.
    """

    # Patterns used by ``clean_text``; compiled once instead of once per tweet.
    _URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    _NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')

    def __init__(self, df: pd.DataFrame, content_column: str, sentiment_column: str):
        """
        Initialize the DataPreprocessor class with the DataFrame and relevant columns.

        Parameters:
        - df: pd.DataFrame
            The DataFrame containing the dataset. It is stored by reference,
            so steps that add columns (``clean_text``) also modify the
            caller's object.
        - content_column: str
            The name of the column containing the text content to analyze.
        - sentiment_column: str
            The name of the column containing the sentiment labels.
        """
        self.df = df
        self.content_column = content_column
        self.sentiment_column = sentiment_column

    def remove_similar_content(self, similarity_threshold: float = 0.65) -> pd.DataFrame:
        """
        Removes samples with similar content based on cosine similarity.

        Texts are vectorized with TF-IDF and compared pairwise. Whenever two
        rows exceed the threshold, BOTH rows are removed (no representative is
        kept). The surviving rows get a fresh 0..n-1 index.

        Parameters:
        - similarity_threshold: float (default=0.65)
            The threshold for cosine similarity above which samples are considered similar.

        Returns:
        - pd.DataFrame: the filtered DataFrame (also stored in ``self.df``).
        """
        if self.df.empty:
            # Nothing to compare, and a vectorizer cannot be fitted on zero documents.
            return self.df

        tfidf = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf.fit_transform(self.df[self.content_column])
        cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

        # Keep only the entries above the diagonal so that every pair is
        # considered once and a row is never compared with itself.
        similar_pairs = np.triu(cosine_sim > similarity_threshold, k=1)
        # A row is flagged if it takes part in any similar pair, as either member.
        flagged = similar_pairs.any(axis=0) | similar_pairs.any(axis=1)

        # The mask is positional, so this is correct even when the DataFrame
        # does not carry a default integer index.
        self.df = self.df[~flagged].reset_index(drop=True)
        return self.df

    def handle_class_imbalance_with_SMOTE(self) -> pd.DataFrame:
        """
        Handles class imbalance using SMOTE (Synthetic Minority Over-sampling Technique).

        The text in ``content_column`` is vectorized with TF-IDF and resampled.
        ``self.df`` is then REPLACED by a DataFrame with one column per TF-IDF
        feature plus the sentiment column; the original text and every other
        column are no longer available afterwards. A TF-IDF feature whose name
        equals ``sentiment_column`` would be overwritten by the labels.

        Returns:
        - pd.DataFrame: the resampled feature matrix with labels.
        """
        tfidf = TfidfVectorizer(stop_words='english')
        X = tfidf.fit_transform(self.df[self.content_column])
        y = self.df[self.sentiment_column]

        # Apply SMOTE to the vectorized text; fixed seed keeps runs reproducible.
        smote = SMOTE(random_state=21)
        X_resampled, y_resampled = smote.fit_resample(X, y)

        self.df = pd.DataFrame(X_resampled.toarray(), columns=tfidf.get_feature_names_out())
        self.df[self.sentiment_column] = y_resampled
        return self.df

    def clean_text(self) -> pd.DataFrame:
        """
        Cleans text data and stores the result in a new 'cleaned_content' column.

        Per tweet: emojis are replaced by their names, links are removed,
        every character that is not an ASCII letter or whitespace is dropped,
        the text is tokenized, English stop words are removed and the
        remaining lower-cased words are lemmatized.

        Returns:
        - pd.DataFrame: ``self.df`` with the added 'cleaned_content' column.
        """
        nltk.download('stopwords')
        # word_tokenize and WordNetLemmatizer need their own data packages;
        # without them a fresh environment fails with a LookupError.
        # NOTE: the tokenizer data is named 'punkt' or 'punkt_tab' depending
        # on the NLTK release, so both are requested.
        for resource in ('punkt', 'punkt_tab', 'wordnet'):
            nltk.download(resource, quiet=True)

        stop_words = set(stopwords.words('english'))
        # One lemmatizer for the whole column instead of one per tweet.
        lemmatizer = nltk.WordNetLemmatizer()

        def clean(tweet):
            # Convert emojis to their names, surrounded by spaces instead of
            # colons. The non-letter filter below also strips underscores, so
            # multi-word names end up fused (e.g. "policecarlight").
            tweet = emoji.demojize(tweet, delimiters=(" ", " "))
            # Remove links
            tweet = self._URL_PATTERN.sub('', tweet)
            # Remove punctuation and numbers
            tweet = self._NON_LETTER_PATTERN.sub('', tweet)
            # Tokenize and lower-case once, then drop stop words and lemmatize
            words = (word.lower() for word in nltk.word_tokenize(tweet))
            return ' '.join(
                lemmatizer.lemmatize(word) for word in words if word not in stop_words
            )

        self.df['cleaned_content'] = self.df[self.content_column].apply(clean)
        return self.df

    def preprocess(self, remove_similar=True, balance_classes=True, clean_text=True) -> pd.DataFrame:
        """
        Performs the full preprocessing pipeline.

        Steps run in the order: remove similar content, clean text, balance
        classes. Balancing must come last because it replaces the DataFrame
        with TF-IDF features, after which the text column no longer exists
        for the other steps to read. Balancing vectorizes ``content_column``,
        so the 'cleaned_content' column is not part of its output.

        Parameters:
        - remove_similar: bool (default=True)
            Whether to remove similar content.
        - balance_classes: bool (default=True)
            Whether to handle class imbalance.
        - clean_text: bool (default=True)
            Whether to clean text data.

        Returns:
        - pd.DataFrame: the DataFrame after all requested steps.
        """
        if remove_similar:
            self.remove_similar_content()
        if clean_text:
            self.clean_text()
        if balance_classes:
            self.handle_class_imbalance_with_SMOTE()

        return self.df


In [7]:
# Create the preprocessor for the training data: tweet text is read from
# 'content' and labels from 'sentiment'.
preprocesser = DataPreprocessor(df=train_df, content_column='content', sentiment_column='sentiment')

In [8]:
# Run only the similar-content removal step (default similarity threshold).
bot_removed_train_df = preprocesser.preprocess(remove_similar=True, balance_classes=False, clean_text=False)

In [9]:
# Sanity check: the step should still hand back a DataFrame.
type(bot_removed_train_df)

pandas.core.frame.DataFrame

In [10]:
# Class distribution after similar-content removal.
bot_removed_train_df['sentiment'].value_counts()

sentiment
True     1125
False     274
Name: count, dtype: int64

In [15]:
# Preview the first rows that survived similar-content removal.
bot_removed_train_df.head(20)

Unnamed: 0,hashtags,content,username,sentiment
0,"[Bitcoin, bitcoinordinals, crypto]",Alright I have my rares. Who else is grabbing ...,spartantc81,True
1,[BTC],📢 Xverse Web-based pool is live:\n\n•Update @x...,godfred_xcuz,True
2,[Bitcoin],"Yesterday, a Bitcoin projection was displayed ...",goddess81oo,True
3,"[Crypto, Bitcoin, Investing]",Unpopular opinion:\n\nThis pump isn’t going to...,CloseSomeSayles,False
4,[Bitcoin],#Bitcoin fixes this,ShannenJPEG,True
5,[Bitcoin],Solid bid in major ALT/BTC pairs today. \n\nIf...,tedtalksmacro,True
6,[bitcoin],"If you're filing 2022 taxes in the U.S., you s...",unchainedcom,True
7,"[ElizabethWarren, crypto, cryptocurrency, bitc...",🚨ELIZABETH WARREN ANTI CRYPTO CAMPAIGN &amp; M...,ThinkingCrypto1,True
8,[Bitcoin],#Bitcoin Somethig different https://t.co/UIaFI...,sailortrades,True
9,"[Silver, Gold, Bitcoin, IStandWithTrump]",They want riots. Give them bank runs instead.\...,ShoreProgress,False


In [23]:
# Second preprocessor that works on the de-duplicated DataFrame.
text_cleaned_preprocesser = DataPreprocessor(df=bot_removed_train_df, content_column='content', sentiment_column='sentiment')

In [16]:
# Run only the text-cleaning step; it adds a 'cleaned_content' column.
text_cleaned_train_df = text_cleaned_preprocesser.preprocess(remove_similar=False, balance_classes=False, clean_text=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Diya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
# Preview the raw text next to its cleaned version.
text_cleaned_train_df.head(20)

Unnamed: 0,hashtags,content,username,sentiment,cleaned_content
0,"[Bitcoin, bitcoinordinals, crypto]",Alright I have my rares. Who else is grabbing ...,spartantc81,True,alright rares else grabbing dogepunksbtc disco...
1,[BTC],📢 Xverse Web-based pool is live:\n\n•Update @x...,godfred_xcuz,True,loudspeaker xverse webbased pool live update x...
2,[Bitcoin],"Yesterday, a Bitcoin projection was displayed ...",goddess81oo,True,yesterday bitcoin projection displayed europea...
3,"[Crypto, Bitcoin, Investing]",Unpopular opinion:\n\nThis pump isn’t going to...,CloseSomeSayles,False,unpopular opinion pump isnt going stop enterin...
4,[Bitcoin],#Bitcoin fixes this,ShannenJPEG,True,bitcoin fix
5,[Bitcoin],Solid bid in major ALT/BTC pairs today. \n\nIf...,tedtalksmacro,True,solid bid major altbtc pair today bitcoin cont...
6,[bitcoin],"If you're filing 2022 taxes in the U.S., you s...",unchainedcom,True,youre filing tax u educate tax implication own...
7,"[ElizabethWarren, crypto, cryptocurrency, bitc...",🚨ELIZABETH WARREN ANTI CRYPTO CAMPAIGN &amp; M...,ThinkingCrypto1,True,policecarlight elizabeth warren anti crypto ca...
8,[Bitcoin],#Bitcoin Somethig different https://t.co/UIaFI...,sailortrades,True,bitcoin somethig different
9,"[Silver, Gold, Bitcoin, IStandWithTrump]",They want riots. Give them bank runs instead.\...,ShoreProgress,False,want riot give bank run instead silver gold bi...


In [20]:
# Show one example row in full: the raw tweet and its cleaned text.
index = 3
text_cleaned_train_df['content'].iloc[index], text_cleaned_train_df['cleaned_content'].iloc[index]

('Unpopular opinion:\n\nThis pump isn’t going to stop and we are entering a full fledged bull run.\n\nHOWEVER, there will be a major sell off sometime between April and June next year.\n#Crypto #Bitcoin #Investing https://t.co/4fOMtYvY3c',
 'unpopular opinion pump isnt going stop entering full fledged bull run however major sell sometime april june next year crypto bitcoin investing')

In [24]:
# Run only the SMOTE balancing step. This replaces the preprocessor's
# DataFrame with TF-IDF feature columns plus the sentiment label.
balanced_train_df = text_cleaned_preprocesser.preprocess(remove_similar=False, balance_classes=True, clean_text=False)

In [28]:
# Rows after oversampling and number of columns (TF-IDF features + label).
balanced_train_df.shape

(2250, 7707)

# 1. Benchmark: vaderSentiment Sentiment Dictionary

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer