Structure of the data preprocessing pipeline

In [None]:
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class DataPreprocessor:
    """
    Preprocessing pipeline for a text dataset held in a pandas DataFrame.

    The instance is stateful: every step replaces or updates ``self.df``, so
    steps applied one after another build on each other's output.
    """

    # NLTK resources used by clean_text: the stop-word list, the tokenizer
    # models ('punkt' on older NLTK releases, 'punkt_tab' on newer ones) and
    # the WordNet corpus backing the lemmatizer.
    _NLTK_RESOURCES = ('stopwords', 'punkt', 'punkt_tab', 'wordnet')

    def __init__(self, df, content_column, sentiment_column):
        """
        Initialize the DataPreprocessor class with the DataFrame and relevant columns.

        Parameters:
        - df: pd.DataFrame
            The DataFrame containing the dataset.
        - content_column: str
            The name of the column containing the text content to analyze.
        - sentiment_column: str
            The name of the column containing the sentiment labels.
        """
        self.df = df
        self.content_column = content_column
        self.sentiment_column = sentiment_column

    def remove_similar_content(self, similarity_threshold=0.8):
        """
        Removes samples with similar content based on cosine similarity.

        Every row that is part of at least one pair whose TF-IDF cosine
        similarity exceeds the threshold is dropped (both members of the pair
        go, not just the later one). The remaining rows get a fresh 0..n-1
        index.

        Parameters:
        - similarity_threshold: float (default=0.8)
            The threshold for cosine similarity above which samples are considered similar.
        """
        # Fewer than two rows means there is no pair to compare; this also
        # avoids fitting a vectorizer on an empty frame.
        if len(self.df) < 2:
            self.df = self.df.reset_index(drop=True)
            return

        tfidf = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf.fit_transform(self.df[self.content_column])
        cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

        # Only the strict upper triangle is inspected: each unordered pair is
        # looked at once and a row is never compared with itself.
        similar_pairs = np.triu(cosine_sim > similarity_threshold, k=1)
        flagged = similar_pairs.any(axis=0) | similar_pairs.any(axis=1)

        # The mask is positional, so this is correct whatever index labels the
        # frame carries (dropping by label would hit the wrong rows, or raise,
        # on a frame whose index is not 0..n-1).
        self.df = self.df.loc[~flagged].reset_index(drop=True)

    def handle_class_imbalance(self, random_state=None):
        """
        Handles class imbalance by randomly oversampling the minority classes.

        SMOTE interpolates between numeric feature vectors and therefore
        cannot be applied to a column of raw strings; random oversampling
        duplicates existing rows instead, which keeps the text intact.
        The resulting frame contains only the content and sentiment columns.

        Parameters:
        - random_state: int or None (default=None)
            Seed forwarded to the sampler for reproducible resampling.
        """
        sampler = RandomOverSampler(random_state=random_state)
        X = self.df[[self.content_column]]
        y = self.df[self.sentiment_column]

        X_resampled, y_resampled = sampler.fit_resample(X, y)

        # Convert to plain arrays so both columns line up by position rather
        # than by index label.
        self.df = pd.DataFrame({
            self.content_column: X_resampled[self.content_column].to_numpy(),
            self.sentiment_column: pd.Series(y_resampled).to_numpy(),
        })

    def clean_text(self):
        """
        Cleans text data by removing punctuation, stopwords, and applying lemmatization.

        The content column is overwritten in place with the cleaned,
        lower-cased text.
        """
        # word_tokenize and WordNetLemmatizer need their own NLTK data in
        # addition to the stop-word list.
        for resource in self._NLTK_RESOURCES:
            nltk.download(resource, quiet=True)
        stop_words = set(stopwords.words('english'))
        # Built once and shared by every row.
        lemmatizer = nltk.WordNetLemmatizer()

        def clean(sentence):
            # Remove punctuation and numbers
            sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)
            # Tokenize and lower-case
            tokens = (word.lower() for word in nltk.word_tokenize(sentence))
            # Drop stop words, then lemmatize what is left
            return ' '.join(
                lemmatizer.lemmatize(token)
                for token in tokens
                if token not in stop_words
            )

        self.df[self.content_column] = self.df[self.content_column].apply(clean)

    def preprocess(self, remove_similar=True, balance_classes=True, clean_text=True):
        """
        Performs the full preprocessing pipeline.

        Parameters:
        - remove_similar: bool (default=True)
            Whether to remove similar content.
        - balance_classes: bool (default=True)
            Whether to handle class imbalance.
        - clean_text: bool (default=True)
            Whether to clean text data.

        Returns:
        - pd.DataFrame
            The processed DataFrame (also kept on the instance as ``self.df``).
        """
        if remove_similar:
            self.remove_similar_content()
        if balance_classes:
            self.handle_class_imbalance()
        if clean_text:
            self.clean_text()

        return self.df


### Example 

In [None]:
# Assuming your DataFrame is `df`, with 'content' as the text column and 'sentiment' as the label column
preprocessor = DataPreprocessor(df, content_column='content', sentiment_column='sentiment')

# To run the full preprocessing pipeline:
df_processed = preprocessor.preprocess()

# If you only want to run specific steps, you can pass parameters:
# For example, to remove similar content and clean text but skip class imbalance handling.
# The preprocessor keeps its processed frame on the instance, so start from a fresh
# instance here; reusing the one above would re-process the output of the previous run.
preprocessor = DataPreprocessor(df, content_column='content', sentiment_column='sentiment')
df_processed = preprocessor.preprocess(balance_classes=False)


In [None]:
### COPIED CELL

class DataPreprocessor:
    """
    Preprocessing pipeline for a tweet-like text dataset held in a pandas DataFrame.

    The instance is stateful: every step replaces or updates ``self.df`` and
    also returns it.
    """

    # A hashtag is '#' followed by one or more word characters.
    _HASHTAG_RE = re.compile(r'#\w+')

    # NLTK resources used by clean_text: the stop-word list, the tokenizer
    # models ('punkt' on older NLTK releases, 'punkt_tab' on newer ones) and
    # the WordNet corpus backing the lemmatizer.
    _NLTK_RESOURCES = ('stopwords', 'punkt', 'punkt_tab', 'wordnet')

    def __init__(self, df, content_column, sentiment_column):
        """
        Initialize the DataPreprocessor class with the DataFrame and relevant columns.

        Parameters:
        - df: pd.DataFrame
            The DataFrame containing the dataset.
        - content_column: str
            The name of the column containing the text content to analyze.
        - sentiment_column: str
            The name of the column containing the sentiment labels.
        """
        self.df = df
        self.content_column = content_column
        self.sentiment_column = sentiment_column

    def remove_spam_content(self, df=None, content_column=None, sentiment_column=None,
                            similarity_threshold=0.65):
        """
        Removes samples with similar content based on cosine similarity.

        Every row that is part of at least one pair whose TF-IDF cosine
        similarity exceeds the threshold is dropped (both members of the pair
        go). The remaining rows get a fresh 0..n-1 index.

        Parameters:
        - df, content_column, sentiment_column:
            Accepted only for backward compatibility and ignored; the method
            always works on the frame and column names stored on the instance.
            They default to None so the method can be called without arguments.
        - similarity_threshold: float (default=0.65)
            The threshold for cosine similarity above which samples are considered similar.

        Returns:
        - pd.DataFrame
            The filtered DataFrame (also stored as ``self.df``).
        """
        # Fewer than two rows means there is no pair to compare; this also
        # avoids fitting a vectorizer on an empty frame.
        if len(self.df) < 2:
            self.df = self.df.reset_index(drop=True)
            return self.df

        tfidf = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf.fit_transform(self.df[self.content_column])
        cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

        # Only the strict upper triangle is inspected: each unordered pair is
        # looked at once and a row is never compared with itself.
        similar_pairs = np.triu(cosine_sim > similarity_threshold, k=1)
        flagged = similar_pairs.any(axis=0) | similar_pairs.any(axis=1)

        # The mask is positional, so this is correct whatever index labels the
        # frame carries.
        self.df = self.df.loc[~flagged].reset_index(drop=True)
        return self.df

    def extract_hashtags(self, text):
        """Extract all hashtags from the text."""
        return self._HASHTAG_RE.findall(text)

    def create_hashtag_whitelist(self, top_n=30):
        """Create a whitelist of the top N most frequent hashtags."""
        # Count hashtags row by row instead of concatenating every per-row
        # list, which would be quadratic in the number of rows.
        hashtag_counts = Counter()
        for text in self.df[self.content_column]:
            hashtag_counts.update(self.extract_hashtags(text))
        # Get the top N hashtags
        return [hashtag for hashtag, _ in hashtag_counts.most_common(top_n)]

    def remove_hashtags(self, top_n=30):
        """
        Remove non-whitelisted hashtags from the text.

        The result is written to a new 'hashtag_removed' column; the content
        column itself is left untouched.

        Parameters:
        - top_n: int (default=30)
            Number of most frequent hashtags to keep.

        Returns:
        - pd.DataFrame
            The DataFrame with the added 'hashtag_removed' column.
        """
        # Create the whitelist of top N hashtags (a set for O(1) membership tests)
        whitelist = frozenset(self.create_hashtag_whitelist(top_n=top_n))

        def keep_if_whitelisted(match):
            hashtag = match.group(0)
            return hashtag if hashtag in whitelist else ''

        def clean_hashtags(tweet):
            # Substitute whole hashtag tokens only. A plain str.replace would
            # also eat the prefix of a longer, whitelisted hashtag
            # (removing '#ai' must not turn '#airplane' into 'rplane').
            return self._HASHTAG_RE.sub(keep_if_whitelisted, tweet)

        # Apply the cleaning function to the dataframe
        self.df['hashtag_removed'] = self.df[self.content_column].apply(clean_hashtags)
        return self.df

    def clean_text(self):
        """
        Cleans text data by converting emojis to text, removing links,
        punctuation and stopwords, and applying lemmatization.

        The cleaned text is written to a new 'cleaned_content' column. The
        input is always the original content column, not 'hashtag_removed'.

        Returns:
        - pd.DataFrame
            The DataFrame with the added 'cleaned_content' column.
        """
        # word_tokenize and WordNetLemmatizer need their own NLTK data in
        # addition to the stop-word list.
        for resource in self._NLTK_RESOURCES:
            nltk.download(resource, quiet=True)
        stop_words = set(stopwords.words('english'))
        # Built once and shared by every row.
        lemmatizer = nltk.WordNetLemmatizer()

        def clean(tweet):
            # Convert emojis to text, e.g. "😊" becomes " smiling_face "
            # NOTE(review): `emoji` is not imported in the visible cells; it must be
            # imported (third-party `emoji` package) before this method is called.
            tweet = emoji.demojize(tweet, delimiters=(" ", " "))
            # Remove links
            tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
            # Remove punctuation and numbers
            tweet = re.sub(r'[^a-zA-Z\s]', '', tweet)
            # Tokenize and lower-case
            tokens = (word.lower() for word in nltk.word_tokenize(tweet))
            # Drop stop words, then lemmatize what is left
            return ' '.join(
                lemmatizer.lemmatize(token)
                for token in tokens
                if token not in stop_words
            )

        self.df['cleaned_content'] = self.df[self.content_column].apply(clean)
        return self.df

    def handle_class_imbalance_with_SMOTE(self):
        """
        Handles class imbalance using SMOTE (Synthetic Minority Over-sampling Technique).

        The content column is vectorized with TF-IDF and SMOTE is applied to
        those vectors. ``self.df`` is replaced by a dense frame with one column
        per TF-IDF feature plus the sentiment column; the text columns are not
        carried over.

        Returns:
        - pd.DataFrame
            The resampled feature frame (also stored as ``self.df``).
        """
        tfidf = TfidfVectorizer(stop_words='english')
        X = tfidf.fit_transform(self.df[self.content_column])
        y = self.df[self.sentiment_column]

        # Apply SMOTE to the vectorized text
        smote = SMOTE(random_state=21)
        X_resampled, y_resampled = smote.fit_resample(X, y)

        self.df = pd.DataFrame(X_resampled.toarray(), columns=tfidf.get_feature_names_out())
        self.df[self.sentiment_column] = y_resampled
        return self.df

    def preprocess(self, remove_spam=False, remove_hashtags=False, clean_text=False, balance_classes=False):
        """
        Performs the preprocessing pipeline; every step is opt-in.

        Parameters:
        - remove_spam: bool (default=False)
            Whether to remove near-duplicate (spam-like) content.
        - remove_hashtags: bool (default=False)
            Whether to strip hashtags that are not among the most frequent ones.
        - clean_text: bool (default=False)
            Whether to clean text data.
        - balance_classes: bool (default=False)
            Whether to handle class imbalance with SMOTE.

        Returns:
        - pd.DataFrame
            The processed DataFrame (also kept on the instance as ``self.df``).
        """
        if remove_spam:
            self.remove_spam_content()
        if remove_hashtags:
            self.remove_hashtags()
        # Text cleaning has to run before balancing: SMOTE replaces the frame
        # with TF-IDF features, after which the content column no longer exists.
        if clean_text:
            self.clean_text()
        if balance_classes:
            self.handle_class_imbalance_with_SMOTE()

        return self.df
