In [2]:
# --- 3.0 Data Preparation ---

from pathlib import Path

import joblib # Library for saving and loading Python objects
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Location of the processed dataset, given relative to the 'notebooks'
# folder (the leading '../' climbs one directory up from it).
file_path = '../data/processed/combined_data_processed.csv'

# Read the CSV into a DataFrame.
df = pd.read_csv(file_path)

# Sanity check: print the first five rows to confirm the load worked.
preview = df.head(n=5)
print(preview)

   label                                               text
0      1                             Good morning every one
1      0  TW: S AssaultActually horrified how many frien...
2      1  Thanks by has notice of me Greetings : Jossett...
3      0                its ending soon aah unhappy _EMOJI_
4      1                         My real time happy _EMOJI_


In [3]:
# Features and target come straight from the DataFrame's columns:
#   X -> the input data (the text of the posts)
#   y -> the target variable (the sentiment label)
X, y = df['text'], df['label']

# First cut: keep 70% for training and set the remaining 30% aside as a
# temporary pool that will become the validation and test sets.
# Stratifying on y keeps the label proportions the same on both sides.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Second cut: halve the temporary pool, giving validation and test sets of
# 15% of the original data each. Stratifying on y_temp preserves the label
# proportions in these final splits as well.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Report the size of every split so the 70/15/15 partition can be verified.
print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
print(f"Validation set shape: X={X_val.shape}, y={y_val.shape}")
print(f"Test set shape: X={X_test.shape}, y={y_test.shape}")

Training set shape: X=(700,), y=(700,)
Validation set shape: X=(150,), y=(150,)
Test set shape: X=(150,), y=(150,)


In [4]:
# TF-IDF turns each text into a row of numerical features.
# max_features=1000 caps the vocabulary at the 1000 most frequent terms,
# which keeps the feature matrix small.
vectorizer = TfidfVectorizer(max_features=1000)

# Learn the vocabulary and IDF weights from the training texts only, and
# vectorize them in the same step. Validation and test texts are kept out
# of the fit so that nothing about them leaks into the features.
X_train_vec = vectorizer.fit_transform(X_train)

# Project the held-out splits onto the vocabulary and weights learned above.
X_val_vec, X_test_vec = (
    vectorizer.transform(held_out) for held_out in (X_val, X_test)
)

# Report the resulting matrix shapes (rows = documents, columns = terms).
print(f"Vectorized Training set shape: {X_train_vec.shape}")
print(f"Vectorized Validation set shape: {X_val_vec.shape}")
print(f"Vectorized Test set shape: {X_test_vec.shape}")

Vectorized Training set shape: (700, 1000)
Vectorized Validation set shape: (150, 1000)
Vectorized Test set shape: (150, 1000)


In [6]:
# Destination file for the prepared artifacts.
# The '../' moves up one directory from the 'notebooks' folder
save_path = '../models/prepared_data.pkl'

# joblib.dump does not create missing directories, so make sure the target
# folder exists first. Without this, a fresh checkout that has no 'models'
# folder yet fails with FileNotFoundError when the notebook is run end to end.
Path(save_path).parent.mkdir(parents=True, exist_ok=True)

# Bundle everything a later notebook needs into one dictionary: the
# vectorized feature matrices, the matching label Series, and the fitted
# vectorizer (so new text can be transformed with the same vocabulary).
prepared_data = {
    'X_train_vec': X_train_vec,
    'X_val_vec': X_val_vec,
    'X_test_vec': X_test_vec,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
    'vectorizer': vectorizer
}

# Serialize the whole dictionary to a single file.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(prepared_data, save_path)
print(f"\nPrepared data and vectorizer saved to '{save_path}'")


Prepared data and vectorizer saved to '../models/prepared_data.pkl'
