In [4]:
#Import libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
import pickle

In [None]:
#Download NLTK data files
# 'stopwords' backs stopwords.words('english'); 'wordnet' backs WordNetLemmatizer.
nltk.download(info_or_id='stopwords')
nltk.download(info_or_id='wordnet')

In [79]:
#Import datasets
# Paths use forward slashes, which pandas resolves on Windows, Linux and macOS alike;
# backslash separators only resolve on Windows.
RAW_DATA_DIR = 'raw data'
TWITTER_COLUMNS = ['Tweet id', 'topic', 'sentiment', 'Tweet content']

# The Twitter files are read with header=None, so the column names are supplied explicitly.
twitter_train = pd.read_csv(f'{RAW_DATA_DIR}/twitter_training.csv', header=None, names=TWITTER_COLUMNS)
twitter_val = pd.read_csv(f'{RAW_DATA_DIR}/twitter_validation.csv', header=None, names=TWITTER_COLUMNS)
# Training and validation splits are stacked into one frame with a fresh 0..n-1 index.
twitter = pd.concat([twitter_train, twitter_val], ignore_index=True)
emotion_text = pd.read_csv(f'{RAW_DATA_DIR}/tweet_emotions.csv')
youtube_comments = pd.read_csv(f'{RAW_DATA_DIR}/YoutubeCommentsDataSet.csv')

In [80]:
#Text normalisation helper applied to every dataset's text column
# Cache for the NLTK objects used by clean_text. It is filled on the first string
# passed to clean_text, so defining this cell does not touch the NLTK corpora, and
# the stop-word list is loaded once instead of once per row.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the shared (stop_words, lemmatizer) pair, building it on first use."""
    if not _TEXT_TOOLS:
        # Build both objects before storing either, so a failure leaves the cache empty
        # and the next call simply retries.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _TEXT_TOOLS.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['lemmatizer']


def clean_text(text):
    """Normalise a piece of raw text into a space-separated string of lemmas.

    Steps, in order: lowercase, strip URLs, strip @mentions and #hashtags,
    strip punctuation/special characters, strip digits, collapse whitespace,
    drop English stop words, lemmatize the remaining words.

    Parameters
    ----------
    text : object
        The raw text. Anything that is not a ``str`` (e.g. NaN from a missing
        cell) is treated as having no text.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is not a string or nothing
        survives the cleaning.
    """
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Remove user mentions and hashtags (the whole token, not just the symbol)
    text = re.sub(r'@\w+|#\w+', '', text)
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove stopwords and lemmatize
    # NOTE(review): punctuation is stripped before this filter, so a contraction such as
    # "don't" arrives here as "dont"; presumably that form is not in the stop-word list
    # and therefore survives - confirm whether that is intended.
    stop_words, lemmatizer = _get_text_tools()
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]

    return ' '.join(words)

In [81]:
#Drop irrelevant rows
# Keep every tweet whose label is anything other than 'Irrelevant'.
is_relevant = twitter['sentiment'].ne('Irrelevant')
twitter = twitter[is_relevant]

In [82]:
#Map youtube labels
capital = {
    'positive': 'Positive',
    'negative': 'Negative',
    'neutral': 'Neutral'
}
# Labels are stripped and lower-cased before the lookup. Besides tolerating stray
# casing/whitespace, this makes the cell safe to re-run: already-capitalised labels
# map back to themselves instead of becoming NaN (values missing from the dict map to NaN).
youtube_comments['Sentiment'] = youtube_comments['Sentiment'].str.strip().str.lower().map(capital)

In [83]:
#Map emotions to uniform data
emotion_to_sentiment = {
    'enthusiasm': 'Positive',
    'surprise': 'Positive',
    'love': 'Positive',
    'fun': 'Positive',
    'happiness': 'Positive',
    'neutral': 'Neutral',
    'relief': 'Positive',
    'anger': 'Negative',
    'boredom': 'Negative',
    'hate': 'Negative',
    'worry': 'Negative',
    'sadness': 'Negative',
    'empty': 'Negative'
}

# Apply mapping to emotion dataset
# The lookup also maps the target labels onto themselves ('neutral' is already a key),
# and values are stripped/lower-cased first. Re-running this cell therefore leaves
# already-mapped labels intact instead of turning them all into NaN.
_emotion_lookup = {**emotion_to_sentiment, 'positive': 'Positive', 'negative': 'Negative'}
emotion_text['sentiment'] = emotion_text['sentiment'].str.strip().str.lower().map(_emotion_lookup)

In [84]:
#Preprocess data
# Every source gets a 'cleaned_text' column and is then narrowed to the same two
# columns ('cleaned_text', 'sentiment') so the frames can be stacked afterwards.
shared_columns = ['cleaned_text', 'sentiment']

# Twitter dataset
twitter['cleaned_text'] = twitter['Tweet content'].apply(clean_text)
twitter_subset = twitter[shared_columns]

# YouTube dataset (its label column is capitalised, so it is renamed to match the others)
youtube_comments['cleaned_text'] = youtube_comments['Comment'].apply(clean_text)
youtube_subset = youtube_comments[['cleaned_text', 'Sentiment']].rename(columns={'Sentiment': 'sentiment'})

# Emotion dataset
emotion_text['cleaned_text'] = emotion_text['content'].apply(clean_text)
emotion_subset = emotion_text[shared_columns]

In [85]:
#Concat data into one dataset
combined_data = pd.concat([twitter_subset, youtube_subset, emotion_subset], ignore_index=True)
# clean_text returns '' (not NaN) for non-string input and for text that is entirely
# stripped away, so dropna() alone would keep those rows. They carry no text, and an
# empty string written by to_csv is read back by read_csv as NaN by default, so they
# are removed here together with rows that have a missing value.
has_text = combined_data['cleaned_text'].ne('')
combined_data = combined_data[has_text].dropna()

In [86]:
#drop duplicates
# Keep the first occurrence of each row that is identical across all columns;
# the surviving rows keep their existing index labels.
is_repeat = combined_data.duplicated()
combined_data = combined_data[~is_repeat]

In [87]:
#Encode data
# Fit the encoder on the sentiment labels and store the resulting integer codes
# next to them in a new column.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(combined_data['sentiment'])
combined_data['sentiment_encoded'] = encoded_labels


In [88]:
# Print the mapping for reference
# Codes are cast to built-in int so the mapping prints as plain numbers rather than
# np.int64(...) and can be serialised (e.g. to JSON) without a custom encoder.
class_names = label_encoder.classes_
sentiment_mapping = {
    name: int(code)
    for name, code in zip(class_names, label_encoder.transform(class_names))
}
print("Sentiment mapping:", sentiment_mapping)

Sentiment mapping: {'Negative': np.int64(0), 'Neutral': np.int64(1), 'Positive': np.int64(2)}


In [89]:
#Save data
# index=False keeps the DataFrame's row index out of the file.
output_csv = 'combined_sentiment_data.csv'
combined_data.to_csv(output_csv, index=False)

In [90]:
#Save the label encoder for future use
# The fitted encoder is pickled to disk in binary mode.
encoder_path = 'label_encoder.pkl'
with open(encoder_path, mode='wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

In [91]:
# Preview the first five rows of the combined dataset
combined_data.head(5)

Unnamed: 0,cleaned_text,sentiment,sentiment_encoded
0,im getting borderland murder,Positive,2
1,coming border kill,Positive,2
2,im getting borderland kill,Positive,2
3,im coming borderland murder,Positive,2
6,spent hour making something fun dont know huge...,Positive,2
