In [None]:
# Import libraries
import pandas as pd
import joblib
import re
import nltk
import os

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Download NLTK resources if needed
# nltk.download('wordnet', download_dir="/kaggle/working/")
# nltk.download('stopwords', download_dir="/kaggle/working/")
# nltk.download('punkt', download_dir="/kaggle/working/")

nltk.download('wordnet')
nltk.download('omw-1.4')  # For wordnet language support if needed
nltk.download('stopwords')
# nltk.data.path.append("/kaggle/working/nltk")
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/
!unzip /usr/share/nltk_data/corpora/omw-1.4.zip -d /usr/share/nltk_data/corpora/

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [None]:
# Load the Sentiment140 dataset
data_path = '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv'


def load_sentiment140(csv_path):
    """Read a Sentiment140 CSV and return a two-column frame of labels and tweets.

    Parameters
    ----------
    csv_path : str or path-like
        Header-less, latin-1 encoded CSV with six columns in the order
        polarity, id, date, query, user, text.

    Returns
    -------
    pandas.DataFrame
        Columns ``['polarity', 'text']``. Rows whose polarity is 2 (neutral)
        are dropped; polarity 0 is kept as 0 and polarity 4 becomes 1. Any
        other polarity value maps to NaN. The original row index is kept.

    Raises
    ------
    Whatever ``pandas.read_csv`` raises for a missing or malformed file, and
    ``ValueError`` if the file does not have exactly six columns.
    """
    raw = pd.read_csv(csv_path, encoding='latin-1', header=None)
    raw.columns = ['polarity', 'id', 'date', 'query', 'user', 'text']  # Rename columns

    # Filter out neutral polarity and keep only the needed columns in one
    # .loc step; .copy() makes the result an independent frame, so the label
    # assignment below is not a chained assignment on a slice of `raw`.
    labelled = raw.loc[raw['polarity'] != 2, ['polarity', 'text']].copy()

    # Map polarity to binary labels: 0 = negative, 1 = positive
    labelled['polarity'] = labelled['polarity'].map({0: 0, 4: 1})
    return labelled


# Best-effort load: on any failure the error is printed and `df` stays an
# empty DataFrame (never a half-processed one), so the failure is reported
# here rather than as a confusing partial frame in later cells.
df = pd.DataFrame()
try:
    df = load_sentiment140(data_path)
    print(df.head())  # Preview dataset
except Exception as e:
    print(f"Error loading dataset: {e}")

In [None]:
# Sanity check: print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Preprocessing text

# Fetch emoji and sentiment from csv
def load_emoji_sentiment(csv_file):
    """Build an emoji -> sentiment lookup from a CSV file.

    The file must provide the columns ``Emoji`` and ``Sentiment``. Each row
    contributes one ``{Emoji: Sentiment}`` entry; when an emoji occurs more
    than once, the last row wins.

    Parameters
    ----------
    csv_file : str or path-like
        Location of the CSV, passed straight to ``pandas.read_csv``.

    Returns
    -------
    dict
        Mapping from each ``Emoji`` value to its ``Sentiment`` value.
    """
    table = pd.read_csv(csv_file)
    return {
        symbol: label
        for symbol, label in zip(table['Emoji'], table['Sentiment'])
    }

# Load the emoji sentiment dictionary once, when this cell runs; replace_emojis
# reads this module-level name on every call.
emoji_sentiment_dict = load_emoji_sentiment('/kaggle/input/emoji-with-sentiments/emoji_sentiment.csv')

def replace_emojis(text):
    """Substitute every key of ``emoji_sentiment_dict`` found in ``text``.

    Keys are processed one after another in dictionary order, each pass
    replacing all occurrences of that key with its mapped value, so a later
    key can also match text produced by an earlier replacement.
    """
    converted = text
    for symbol in emoji_sentiment_dict:
        converted = converted.replace(symbol, emoji_sentiment_dict[symbol])
    return converted

def remove_short_words(text):
    """Keep only the whitespace-separated tokens that are two or more characters long.

    The surviving tokens are re-joined with single spaces.
    """
    long_enough = filter(lambda token: len(token) > 1, text.split())
    return ' '.join(long_enough)

def remove_stopwords(text):
    """Drop every whitespace-separated token that is a member of ``stop_words``.

    Matching is exact (case-sensitive); the remaining tokens are re-joined
    with single spaces.
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

def apply_stemming(text):
    """Run ``stemmer.stem`` over each whitespace-separated token of ``text``.

    The stemmed tokens are returned as one string joined by single spaces.
    """
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

def apply_lemmatization(text):
    """Run ``lemmatizer.lemmatize`` over each whitespace-separated token of ``text``.

    Each token is lemmatized with the lemmatizer's default arguments and the
    results are joined back together with single spaces.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)


def clean_text(text):
    """Normalise one raw tweet into a string of cleaned, space-separated tokens.

    Steps, in order:
      1. lowercase the text and substitute emojis via ``replace_emojis``;
      2. apply the regex rewrites listed in ``substitutions``;
      3. drop short words, drop stop words, stem, then lemmatize;
      4. collapse whitespace runs and strip both ends.

    Parameters
    ----------
    text : str
        Raw tweet text. A non-string value fails on ``.lower()``.

    Returns
    -------
    str
        The cleaned text.
    """
    # (pattern, replacement) pairs. Order matters: each rewrite sees the
    # output of the previous one.
    substitutions = (
        (r'http\S+', 'url'),                # URLs -> 'url'
        (r'\b\w*@\w*\.\w*\b', 'email'),     # email addresses -> 'email'
        (r'@\w+', 'user'),                  # user mentions -> 'user'
        (r'#', ''),                         # strip hashtag symbols
        (r'\d+', ''),                       # strip digits
        (r'[^\w\s]', ''),                   # strip punctuation
        (r'(.)\1{2,}', r'\1\1'),            # 3+ repeats of a character -> 2
        (r'\b(\w+)( \1\b)+', r'\1'),        # "word word word" -> "word"
        # Collapse a word-initial sequence that is immediately repeated
        # ("haha" -> "ha"). Run twice so longer repeats shrink further
        # ("hahahaha" -> "haha" -> "ha"). Note this also shortens ordinary
        # words with that shape, e.g. "mama" -> "ma".
        (r'(\b\w+)\1+', r'\1'),
        (r'(\b\w+)(\1)+', r'\1'),
    )
    # Token-level passes, run after all regex rewrites.
    token_passes = (
        remove_short_words,
        remove_stopwords,
        apply_stemming,
        apply_lemmatization,  # Optional: apply both or just one
    )

    cleaned = replace_emojis(text.lower())
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    for token_pass in token_passes:
        cleaned = token_pass(cleaned)

    # Remove extra whitespace
    return re.sub(r'\s+', ' ', cleaned).strip()


# Clean every tweet: the 'text' column is overwritten with the output of clean_text.
# NOTE(review): this mutates `df` in place, so re-running the cell feeds the
# already-cleaned text through clean_text again; reload `df` first for a fresh result.
df['text'] = df['text'].apply(clean_text)

# Preview the first rows after cleaning.
df.head()

In [None]:
# Save the cleaned dataset to CSV: UTF-8 encoded, without the index column.
# The file name is relative, so it is written to the current working directory.
df.to_csv('cleaned_stemmed_lemmatized_sentiment140.csv', index=False, encoding='utf-8')

In [None]:
# Save the cleaned dataset to Parquet, without the index, in the current working directory.
# NOTE(review): needs a Parquet engine (e.g. pyarrow) installed - confirm for the target environment.
df.to_parquet('cleaned_stemmed_lemmatized_sentiment140.parquet', index=False)

In [None]:
# Save the whole DataFrame object using joblib, in the current working directory.
# NOTE: joblib files are pickle-based; only load them back from trusted sources.
joblib.dump(df, 'cleaned_stemmed_lemmatized_sentiment140.joblib')