In [4]:
import pandas as pd

# Sentiment140: columns = [target, ids, date, flag, user, text]
# Added on_bad_lines='skip' to handle rows with parsing errors
# Changed engine to 'python' for potentially better handling of complex errors
df_twitter = pd.read_csv('/content/TwitterDataSet.csv',
                         encoding='latin-1', header=None, on_bad_lines='skip', engine='python')
df_twitter.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Map sentiment labels: 0 = negative, 4 = positive
df_twitter = df_twitter[df_twitter['target'].isin([0,4])]
df_twitter['sentiment'] = df_twitter['target'].map({0: 0, 4: 1})

# Use a subset for demonstration (PSO is slow on full data)
df_twitter = df_twitter.sample(10000, random_state=42)

In [6]:
df_imdb = pd.read_csv('/content/IMDBDataset.csv')
df_imdb['sentiment'] = df_imdb['sentiment'].map({'negative': 0, 'positive': 1})

# Use a subset for demonstration
df_imdb = df_imdb.sample(10000, random_state=42)


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@w+|\#','', text)
    text = re.sub(r"[^A-Za-z0-9\s]", "", text)
    return text.lower()

# Apply cleaning
df_twitter['clean_text'] = df_twitter['text'].apply(clean_text)
df_imdb['clean_text'] = df_imdb['review'].apply(clean_text)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_twitter = vectorizer.fit_transform(df_twitter['clean_text']).toarray()
y_twitter = df_twitter['sentiment'].values

X_imdb = vectorizer.fit_transform(df_imdb['clean_text']).toarray()
y_imdb = df_imdb['sentiment'].values


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Twitter
X_train, X_test, y_train, y_test = train_test_split(X_twitter, y_twitter, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Twitter Sentiment140 Results:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# IMDB
X_train, X_test, y_train, y_test = train_test_split(X_imdb, y_imdb, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("IMDB Results:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


Twitter Sentiment140 Results:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1877
           1       0.75      0.02      0.05       123

    accuracy                           0.94      2000
   macro avg       0.84      0.51      0.51      2000
weighted avg       0.93      0.94      0.91      2000

Accuracy: 0.9395
IMDB Results:
              precision    recall  f1-score   support

           0       0.87      0.82      0.85       999
           1       0.83      0.88      0.86      1001

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000

Accuracy: 0.852


In [9]:
from pyswarm import pso
import numpy as np

# Define a simple fitness function for PSO
def fitness_func(selected_features):
    selected = np.where(selected_features > 0.5)[0]
    if len(selected) == 0:
        return 1  # Penalize empty feature set
    X_sel = X_train[:, selected]
    X_val = X_test[:, selected]
    model = LogisticRegression(max_iter=200)
    model.fit(X_sel, y_train)
    y_pred = model.predict(X_val)
    return 1 - accuracy_score(y_test, y_pred)  # Minimize error

dim = X_train.shape[1]
lb = [0]*dim
ub = [1]*dim

# Run PSO (use small number of particles/iterations for demo)
best_position, best_error = pso(fitness_func, lb, ub, swarmsize=10, maxiter=5)

selected_indices = np.where(best_position > 0.5)[0]
print("Number of selected features:", len(selected_indices))

# Retrain model with selected features
X_train_sel = X_train[:, selected_indices]
X_test_sel = X_test[:, selected_indices]
model = LogisticRegression(max_iter=200)
model.fit(X_train_sel, y_train)
y_pred = model.predict(X_test_sel)
print("Accuracy after PSO feature selection:", accuracy_score(y_test, y_pred))


ModuleNotFoundError: No module named 'pyswarm'