In [None]:
pip install numpy pandas scikit-learn nltk



In [9]:
import os
import tarfile
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK resources used below. 'punkt_tab' is the tokenizer data
# that recent NLTK releases load for word_tokenize; 'punkt' is kept for
# older releases that still read the original resource.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

def extract_data(file_path, dest="."):
    """Unpack a gzip-compressed tarball into ``dest``.

    Args:
        file_path: Path to the archive. Only names ending in ``.tar.gz`` are
            processed; any other path is ignored and nothing is extracted.
        dest: Directory to extract into. Defaults to the current working
            directory, which is where the archive was unpacked before this
            parameter existed.

    Raises:
        OSError, tarfile.TarError: Propagated from ``tarfile`` when the
            archive is missing, unreadable, or rejected by the safety filter.
    """
    if not file_path.endswith(".tar.gz"):
        return

    with tarfile.open(file_path, "r:gz") as tar:
        # The "data" filter refuses members that would land outside `dest`
        # (absolute paths, ".." components, unsafe links). It is only
        # available on interpreters that ship extraction filters, so
        # feature-detect instead of passing the keyword unconditionally.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=dest, filter="data")
        else:
            tar.extractall(path=dest)
        print("Data extracted!")

# Unpack the dataset archive; the review files read further down are expected
# under the rt-polaritydata/ directory it contains.
extract_data("rt-polaritydata.tar.gz")

def _read_labeled_reviews(path, label):
    """Read one review per line from ``path`` and tag every row with ``label``."""
    with open(path, 'r', encoding='ISO-8859-1') as handle:
        sentences = handle.readlines()

    frame = pd.DataFrame(sentences, columns=['review'])
    frame['label'] = label
    return frame


def load_data(data_dir='rt-polaritydata'):
    """Load the positive and negative review files into labelled DataFrames.

    Args:
        data_dir: Directory containing ``rt-polarity.pos`` and
            ``rt-polarity.neg``. Defaults to the directory name used by the
            extracted archive.

    Returns:
        A ``(pos_df, neg_df)`` tuple. Each frame has a ``review`` column with
        one line of the source file per row (the trailing newline is kept, as
        returned by ``readlines``) and a constant ``label`` column: 1 for the
        positive file, 0 for the negative file.

    Raises:
        OSError: If either review file cannot be opened.
    """
    pos_df = _read_labeled_reviews(os.path.join(data_dir, 'rt-polarity.pos'), 1)
    neg_df = _read_labeled_reviews(os.path.join(data_dir, 'rt-polarity.neg'), 0)
    return pos_df, neg_df

# Positive and negative reviews are kept in separate frames (labels 1 and 0)
# so each class can be sliced independently when building the splits.
pos_df, neg_df = load_data()

def split_data(pos_df, neg_df, n_train=4000, n_val=500, random_state=42):
    """Split both classes positionally into shuffled train/val/test frames.

    The same row windows are taken from ``pos_df`` and ``neg_df``: the first
    ``n_train`` rows go to training, the next ``n_val`` rows to validation and
    everything after that to test. Each pair of windows is concatenated,
    shuffled and re-indexed from 0.

    Args:
        pos_df: Frame holding the positive examples.
        neg_df: Frame holding the negative examples.
        n_train: Number of leading rows per class used for training.
        n_val: Number of rows per class, following the training rows, used
            for validation.
        random_state: Seed (or random state object) forwarded to
            ``DataFrame.sample`` so the shuffled row order is reproducible.
            Pass ``None`` for a different order on every call.

    Returns:
        A ``(train_data, val_data, test_data)`` tuple of DataFrames.

    Raises:
        ValueError: If ``n_train`` or ``n_val`` is negative.
    """
    if n_train < 0 or n_val < 0:
        raise ValueError("n_train and n_val must be non-negative")

    val_end = n_train + n_val
    windows = (
        slice(None, n_train),    # train
        slice(n_train, val_end), # validation
        slice(val_end, None),    # test
    )

    shuffled = []
    for window in windows:
        combined = pd.concat([pos_df.iloc[window], neg_df.iloc[window]])
        shuffled.append(
            combined.sample(frac=1, random_state=random_state).reset_index(drop=True)
        )

    train_data, val_data, test_data = shuffled
    return train_data, val_data, test_data

# Build the train / validation / test frames from the two per-class frames.
train_data, val_data, test_data = split_data(pos_df, neg_df)

# Lazily populated cache so the stop-word corpus is read once, not once per row.
_ENGLISH_STOP_WORDS = None


def _default_stop_words():
    """Return NLTK's English stop words as a set, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text, stop_words=None, tokenizer=None):
    """Lower-case, tokenize and filter a piece of text.

    Tokens are kept only when they are purely alphanumeric (``str.isalnum``)
    and are not in the stop-word collection, so punctuation and tokens that
    contain it (for example hyphenated words) are dropped.

    Args:
        text: The string to clean.
        stop_words: Optional container of lower-case words to remove. When
            omitted, NLTK's English stop-word list is used.
        tokenizer: Optional callable mapping a string to an iterable of
            tokens. When omitted, ``nltk.tokenize.word_tokenize`` is used.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty
        string).
    """
    if stop_words is None:
        stop_words = _default_stop_words()
    if tokenizer is None:
        tokenizer = word_tokenize

    tokens = tokenizer(text.lower())
    kept = [token for token in tokens if token.isalnum() and token not in stop_words]
    return " ".join(kept)

# Clean the review text of every split with the same routine.
for split_frame in (train_data, val_data, test_data):
    split_frame['review'] = split_frame['review'].apply(preprocess_text)

# Bag-of-words features: the vectorizer is fitted on the training reviews
# only, and the fitted instance is then reused for the held-out splits.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data['review'])
X_val, X_test = [
    vectorizer.transform(frame['review']) for frame in (val_data, test_data)
]

# Target labels for each split.
y_train, y_val, y_test = [
    frame['label'] for frame in (train_data, val_data, test_data)
]

# Fit the classifier on the training features; fit() returns the estimator itself.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score both held-out splits up front...
val_predictions, test_predictions = clf.predict(X_val), clf.predict(X_test)
val_accuracy = accuracy_score(y_val, val_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

# ...then report them, validation first and test second.
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(classification_report(y_val, val_predictions))

print(f"Test Accuracy: {test_accuracy:.4f}")
print(classification_report(y_test, test_predictions))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data extracted!
Validation Accuracy: 0.7590
              precision    recall  f1-score   support

           0       0.74      0.79      0.77       500
           1       0.78      0.73      0.75       500

    accuracy                           0.76      1000
   macro avg       0.76      0.76      0.76      1000
weighted avg       0.76      0.76      0.76      1000

Test Accuracy: 0.7677
              precision    recall  f1-score   support

           0       0.76      0.78      0.77       831
           1       0.78      0.75      0.76       831

    accuracy                           0.77      1662
   macro avg       0.77      0.77      0.77      1662
weighted avg       0.77      0.77      0.77      1662



In [11]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

def _precision_recall_f1(tp, fp, fn):
    """Derive precision, recall and F1 for the positive class from raw counts.

    Any ratio whose denominator is zero is reported as 0.0 instead of
    producing NaN (or a division error with plain Python ints).
    """
    predicted_positive = tp + fp
    actual_positive = tp + fn
    precision = tp / predicted_positive if predicted_positive else 0.0
    recall = tp / actual_positive if actual_positive else 0.0
    denominator = precision + recall
    f1 = 2 * (precision * recall) / denominator if denominator else 0.0
    return precision, recall, f1


def _print_confusion_summary(heading, tp, tn, fp, fn, precision, recall, f1):
    """Print the confusion counts and derived scores under ``heading``."""
    print(heading)
    print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")


val_predictions = clf.predict(X_val)

# labels=[0, 1] pins the matrix to 2x2 even when one class is absent from
# both the targets and the predictions, so the four-way unpacking is safe.
cm = confusion_matrix(y_val, val_predictions, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
precision, recall, f1 = _precision_recall_f1(tp, fp, fn)

_print_confusion_summary("Validation Results:", tp, tn, fp, fn, precision, recall, f1)

test_predictions = clf.predict(X_test)

cm_test = confusion_matrix(y_test, test_predictions, labels=[0, 1])
tn_test, fp_test, fn_test, tp_test = cm_test.ravel()
precision_test, recall_test, f1_test = _precision_recall_f1(tp_test, fp_test, fn_test)

_print_confusion_summary(
    "\nTest Results:",
    tp_test, tn_test, fp_test, fn_test,
    precision_test, recall_test, f1_test,
)

Validation Results:
TP: 364, TN: 395, FP: 105, FN: 136
Precision: 0.7761
Recall: 0.7280
F1 Score: 0.7513

Test Results:
TP: 625, TN: 651, FP: 180, FN: 206
Precision: 0.7764
Recall: 0.7521
F1 Score: 0.7641
