The structure for data preprocessing

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from imblearn.over_sampling import SMOTE
import nltk
from nltk.corpus import stopwords
import re

class DataPreprocessor:
    """
    Preprocessing helper for a text dataset held in a DataFrame.

    Every step rebinds ``self.df`` to a new DataFrame rather than modifying
    the frame that was passed to the constructor.
    """

    # NLTK resources used by clean_text(): the stop-word list, the tokenizer
    # models ('punkt' for older NLTK releases, 'punkt_tab' for newer ones) and
    # the WordNet corpus used by the lemmatizer.
    _NLTK_RESOURCES = ('stopwords', 'punkt', 'punkt_tab', 'wordnet')

    def __init__(self, df, content_column, sentiment_column):
        """
        Initialize the DataPreprocessor class with the DataFrame and relevant columns.

        Parameters:
        - df: pd.DataFrame
            The DataFrame containing the dataset.
        - content_column: str
            The name of the column containing the text content to analyze.
        - sentiment_column: str
            The name of the column containing the sentiment labels.
        """
        self.df = df
        self.content_column = content_column
        self.sentiment_column = sentiment_column

    def remove_similar_content(self, similarity_threshold=0.8):
        """
        Removes samples with similar content based on cosine similarity.

        Rows are compared pairwise on their TF-IDF vectors. Whenever a pair
        scores above the threshold, BOTH rows of the pair are removed. The
        resulting DataFrame gets a fresh 0..n-1 index.

        Parameters:
        - similarity_threshold: float (default=0.8)
            The threshold for cosine similarity above which samples are considered similar.
        """
        row_count = len(self.df)
        if row_count < 2:
            # Nothing to compare; also avoids fitting TF-IDF on an empty column.
            self.df = self.df.reset_index(drop=True)
            return

        tfidf = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf.fit_transform(self.df[self.content_column])
        # NOTE: this is a dense (n x n) matrix, so memory grows quadratically
        # with the number of rows.
        cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

        positions_to_remove = set()
        for i, row in enumerate(cosine_sim):
            # Only look right of the diagonal so each pair is checked once and
            # a row is never compared with itself.
            later_matches = (row[i + 1:] > similarity_threshold).nonzero()[0]
            if later_matches.size:
                positions_to_remove.add(i)
                positions_to_remove.update((later_matches + i + 1).tolist())

        # The similarity matrix is addressed by row POSITION, so select by
        # position too; dropping by label would be wrong for any DataFrame
        # whose index is not the default 0..n-1 range.
        positions_to_keep = [
            position for position in range(row_count)
            if position not in positions_to_remove
        ]
        self.df = self.df.iloc[positions_to_keep].reset_index(drop=True)

    def handle_class_imbalance(self, random_state=None):
        """
        Handles class imbalance by over-sampling the minority classes.

        SMOTE (Synthetic Minority Over-sampling Technique) creates new samples
        from numeric feature values, so it is only used when the content
        column has a numeric dtype. For text content, existing minority-class
        rows are randomly duplicated (sampling with replacement) until every
        class has as many rows as the largest class.

        In both cases the resulting DataFrame contains only the content and
        sentiment columns and has a fresh 0..n-1 index.

        Parameters:
        - random_state: int or None (default=None)
            Seed for the resampling, for reproducible results.
        """
        features = self.df[self.content_column]
        labels = self.df[self.sentiment_column]

        if pd.api.types.is_numeric_dtype(features):
            smote = SMOTE(random_state=random_state)
            features_resampled, labels_resampled = smote.fit_resample(
                features.values.reshape(-1, 1), labels
            )
            self.df = pd.DataFrame({
                self.content_column: features_resampled.flatten(),
                self.sentiment_column: labels_resampled,
            })
            return

        frame = self.df[[self.content_column, self.sentiment_column]]
        class_counts = labels.value_counts()
        pieces = [frame]
        for label, count in class_counts.items():
            shortfall = int(class_counts.max() - count)
            if shortfall > 0:
                extra_rows = frame[labels == label].sample(
                    n=shortfall, replace=True, random_state=random_state
                )
                pieces.append(extra_rows)

        self.df = pd.concat(pieces, ignore_index=True)

    def clean_text(self):
        """
        Cleans text data by removing punctuation, stopwords, and applying lemmatization.

        Every character that is not an ASCII letter or whitespace is removed,
        the text is tokenized, English stop words are dropped, and the
        remaining words are lower-cased and lemmatized. The cleaned text
        replaces the content column in a copy of the DataFrame.
        """
        # word_tokenize and WordNetLemmatizer need their own NLTK data in
        # addition to the stop-word list; without it they raise LookupError.
        for resource in self._NLTK_RESOURCES:
            nltk.download(resource, quiet=True)

        stop_words = set(stopwords.words('english'))
        # Built once here instead of once per row.
        lemmatizer = nltk.WordNetLemmatizer()
        non_letters = re.compile(r'[^a-zA-Z\s]')

        def clean(sentence):
            # Remove punctuation and numbers
            sentence = non_letters.sub('', sentence)
            # Tokenize and lower-case
            words = (word.lower() for word in nltk.word_tokenize(sentence))
            # Remove stop words, then lemmatize what is left
            return ' '.join(
                lemmatizer.lemmatize(word) for word in words
                if word not in stop_words
            )

        cleaned = self.df[self.content_column].apply(clean)
        # Work on a copy so a DataFrame shared with the caller is not modified.
        frame = self.df.copy()
        frame[self.content_column] = cleaned
        self.df = frame

    def preprocess(self, remove_similar=True, balance_classes=True, clean_text=True):
        """
        Performs the full preprocessing pipeline.

        The enabled steps run in this order: similar-content removal, class
        balancing, text cleaning. The result is stored on the instance, so a
        second call continues from the already processed data.

        Parameters:
        - remove_similar: bool (default=True)
            Whether to remove similar content.
        - balance_classes: bool (default=True)
            Whether to handle class imbalance.
        - clean_text: bool (default=True)
            Whether to clean text data.

        Returns:
        - pd.DataFrame
            The processed DataFrame (the same object as ``self.df``).
        """
        if remove_similar:
            self.remove_similar_content()
        if balance_classes:
            self.handle_class_imbalance()
        if clean_text:
            self.clean_text()

        return self.df


### Example 

In [None]:
# Assuming your DataFrame is `df`, with 'content' as the text column and 'sentiment' as the label column
preprocessor = DataPreprocessor(df, content_column='content', sentiment_column='sentiment')

# To run the full preprocessing pipeline:
df_processed = preprocessor.preprocess()

# If you only want to run specific steps, you can pass parameters.
# preprocess() stores its result back on the instance, so start from a fresh
# preprocessor; reusing the one above would process the already processed data.
# For example, to remove similar content and clean text but skip class imbalance handling:
partial_preprocessor = DataPreprocessor(df, content_column='content', sentiment_column='sentiment')
df_processed = partial_preprocessor.preprocess(balance_classes=False)
