In [7]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Step 1: Load the Dataset
# Define the dataset path
# Root folder of the extracted dataset; it must contain the 'train' and 'test'
# sub-folders read below. Set the ACLIMDB_PATH environment variable to point
# at your own copy; the original local path is kept as the fallback so
# existing setups keep working unchanged.
DATASET_PATH = os.environ.get(
    "ACLIMDB_PATH", r"C:\Users\nadaf\OneDrive\Desktop\Quantbit\aclImdb"
)

def load_data(dataset_path):
    """Load labelled reviews from the ``pos`` and ``neg`` folders of one split.

    Args:
        dataset_path: Directory that contains a ``pos`` and a ``neg``
            sub-directory holding UTF-8 encoded ``.txt`` review files.

    Returns:
        A ``(reviews, labels)`` tuple of two parallel lists: the review texts
        and their labels (1 for ``pos``, 0 for ``neg``). All ``pos`` reviews
        come first, then all ``neg`` reviews; within each folder the files
        are read in sorted file-name order.

    Raises:
        FileNotFoundError: If the ``pos`` or ``neg`` folder does not exist
            (raised by ``os.listdir``).
    """
    reviews = []
    labels = []

    for label_type in ['pos', 'neg']:
        dir_name = os.path.join(dataset_path, label_type)
        label = 1 if label_type == 'pos' else 0
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # sample order (and everything derived from it) reproducible.
        for file_name in sorted(os.listdir(dir_name)):
            if file_name.endswith(".txt"):
                with open(os.path.join(dir_name, file_name), encoding="utf-8") as f:
                    reviews.append(f.read())
                labels.append(label)

    return reviews, labels

# Read the two official splits; each one has its own pos/neg folders
(train_data, train_labels), (test_data, test_labels) = (
    load_data(os.path.join(DATASET_PATH, split)) for split in ('train', 'test')
)

# Step 2: Preprocess and Vectorize Text Data
# TF-IDF features, English stop words removed, vocabulary capped at 5000 terms
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# The vocabulary and IDF weights are learned from the training text only;
# the test text is merely projected onto them.
X_train = vectorizer.fit_transform(train_data)
X_test = vectorizer.transform(test_data)

y_train, y_test = np.array(train_labels), np.array(test_labels)

# Step 3: Train a Logistic Regression Model
# fit() returns the fitted estimator, so construction and training can be chained
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Step 4: Evaluate the Model
predictions = model.predict(X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=predictions))
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=predictions))

# Step 5: Test with Custom Input
# Keep classifying reviews typed by the user until they type 'exit'.
while True:
    try:
        review = input("Enter a review (or type 'exit' to quit): ")
    except (EOFError, KeyboardInterrupt):
        # Interrupting the kernel (or a closed stdin) ends the session
        # cleanly instead of leaving a traceback in the cell output.
        break

    # Ignore surrounding whitespace so that e.g. " exit " still quits
    review = review.strip()
    if review.lower() == 'exit':
        break
    if not review:
        # A blank line yields an all-zero feature vector, so any prediction
        # would not reflect the input at all; ask again instead.
        continue

    review_vec = vectorizer.transform([review])
    prediction = model.predict(review_vec)
    sentiment = "Positive" if prediction[0] == 1 else "Negative"
    print(f"Predicted Sentiment: {sentiment}")


Accuracy: 0.87784
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.87      0.88     12500
           1       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



Enter a review (or type 'exit' to quit):  great


Predicted Sentiment: Positive


Enter a review (or type 'exit' to quit):  worst


Predicted Sentiment: Negative


Enter a review (or type 'exit' to quit):  neutral


Predicted Sentiment: Positive



KeyboardInterrupt



In [None]:
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Define the dataset path
# Root folder of the extracted dataset; it must contain the 'train' and 'test'
# sub-folders read below. The ACLIMDB_PATH environment variable overrides the
# location; the original local path remains the default.
DATASET_PATH = os.environ.get(
    "ACLIMDB_PATH", r"C:\Users\nadaf\OneDrive\Desktop\Quantbit\aclImdb"
)

# Function to load data from the dataset
def load_data(dataset_path):
    """
    Reads the review text files of one dataset split and returns them with labels.

    Args:
        dataset_path: Directory containing a 'pos' and a 'neg' sub-directory
            of UTF-8 encoded '.txt' review files.

    Returns:
        (reviews, labels): two parallel lists. `reviews` holds the file
        contents, `labels` holds 1 for files from 'pos' and 0 for files from
        'neg'. 'pos' files come first; each folder is read in sorted
        file-name order.

    Raises:
        FileNotFoundError: If 'pos' or 'neg' is missing (raised by os.listdir).
    """
    reviews = []  # Review texts
    labels = []   # Matching labels (1 for positive, 0 for negative)

    for label_type in ['pos', 'neg']:
        dir_name = os.path.join(dataset_path, label_type)
        label = 1 if label_type == 'pos' else 0
        # Sort the listing: os.listdir() gives no ordering guarantee, and a
        # stable sample order keeps runs reproducible across machines.
        for file_name in sorted(os.listdir(dir_name)):
            if not file_name.endswith(".txt"):
                continue  # Skip anything that is not a review text file
            with open(os.path.join(dir_name, file_name), encoding="utf-8") as f:
                reviews.append(f.read())
            labels.append(label)

    return reviews, labels

# Read the train split first, then the test split
(train_data, train_labels), (test_data, test_labels) = (
    load_data(os.path.join(DATASET_PATH, split)) for split in ('train', 'test')
)

# Turn the raw text into TF-IDF features (English stop words dropped,
# vocabulary limited to 5000 terms)
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train = vectorizer.fit_transform(train_data)  # Learn the vocabulary on training text
X_test = vectorizer.transform(test_data)        # Reuse that vocabulary for the test text

# Labels as NumPy arrays
y_train, y_test = np.array(train_labels), np.array(test_labels)

# Build and fit the classifier in one expression; fit() returns the estimator
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out test split
predictions = model.predict(X_test)

# Report overall accuracy followed by the per-class metrics
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=predictions))
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=predictions))

# Interactive test for user-provided reviews
while True:
    try:
        review = input("Enter a review (or type 'exit' to quit): ")  # Prompt user for input
    except (EOFError, KeyboardInterrupt):
        # Stop quietly when the kernel is interrupted or stdin is closed,
        # rather than ending the cell with a traceback.
        break

    review = review.strip()  # Tolerate stray spaces around the input
    if review.lower() == 'exit':  # Exit the loop if user types 'exit'
        break
    if not review:
        # Nothing to classify: a blank line has no terms, so the prediction
        # would not depend on the input. Prompt again.
        continue

    review_vec = vectorizer.transform([review])  # Transform the review into numerical features
    prediction = model.predict(review_vec)  # Predict sentiment for the review
    sentiment = "Positive" if prediction[0] == 1 else "Negative"  # Interpret prediction
    print(f"Predicted Sentiment: {sentiment}")  # Display the result


In [23]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Path to the dataset
# Root folder that holds the 'train' and 'test' sub-folders. It can be
# overridden with the ACLIMDB_PATH environment variable so the notebook is
# not tied to one machine; the original local path is the default.
DATASET_PATH = os.environ.get(
    "ACLIMDB_PATH", r"C:\Users\nadaf\OneDrive\Desktop\Quantbit\aclImdb"
)

# Function to load data and save it to a CSV file
def load_and_save_data(dataset_path, output_csv):
    """Load reviews and labels from text files and save to a CSV file.

    Walks ``<dataset_path>/{train,test}/{pos,neg}`` in that order and reads
    every UTF-8 ``.txt`` file, in sorted file-name order within each folder.

    Args:
        dataset_path: Root directory containing the ``train`` and ``test``
            folders, each with ``pos`` and ``neg`` sub-folders.
        output_csv: Path of the CSV file to write (no index column).

    Returns:
        pandas.DataFrame with a ``review`` column (file contents) and a
        ``label`` column (1 for ``pos``, 0 for ``neg``); the same data that
        is written to ``output_csv``.

    Raises:
        FileNotFoundError: If any of the four expected folders is missing.
    """
    # Initialize lists for reviews and labels
    reviews = []
    labels = []

    for dataset_type in ['train', 'test']:
        for label_type in ['pos', 'neg']:
            folder_path = os.path.join(dataset_path, dataset_type, label_type)
            if not os.path.exists(folder_path):
                raise FileNotFoundError(f"Folder not found: {folder_path}")

            label = 1 if label_type == 'pos' else 0
            # os.listdir() order is arbitrary. Sorting fixes the row order,
            # which a seeded positional split of this frame depends on to be
            # reproducible.
            for file_name in sorted(os.listdir(folder_path)):
                if file_name.endswith(".txt"):
                    with open(os.path.join(folder_path, file_name), encoding="utf-8") as file:
                        reviews.append(file.read())
                    labels.append(label)

    # Save the data to a CSV file
    data = pd.DataFrame({"review": reviews, "label": labels})
    data.to_csv(output_csv, index=False)
    return data

# Save reviews and labels to a CSV file
output_csv = "movie_reviews.csv"
data = load_and_save_data(DATASET_PATH, output_csv)
print(f"Data saved to {output_csv}")

# Load data from the CSV file
# keep_default_na=False: by default read_csv turns empty fields and strings
# such as "NA", "null" or "nan" into NaN. A review consisting of such text
# would come back as a float NaN, which is not a valid document for the
# vectorizer. Disabling the default NA parsing keeps every review a string.
data = pd.read_csv(output_csv, keep_default_na=False)

# Hold out 20% of the rows for testing: draw a seeded 80% sample for
# training and use every row that was not drawn as the test set
train_data = data.sample(frac=0.8, random_state=42)
test_data = data.drop(index=train_data.index)

# Pull the review text (features) and the labels out of each partition
X_train, X_test = train_data["review"], test_data["review"]
y_train, y_test = train_data["label"], test_data["label"]

# TF-IDF features: English stop words removed, vocabulary capped at 5000 terms.
# The vocabulary is learned from the training partition only.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Fit the logistic regression classifier; fit() returns the fitted estimator
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Score the held-out partition and report the metrics
predictions = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=predictions))
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=predictions))


Data saved to movie_reviews.csv
Accuracy: 0.8852
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.88      0.88      4973
           1       0.88      0.90      0.89      5027

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [21]:
# Classify one review typed by the user.
# Uses the `vectorizer` and `model` objects that a training cell must already
# have created in this kernel.
review = input("Enter a review: ")
review_vec = vectorizer.transform([review])  # Same feature space the model was trained on
prediction = model.predict(review_vec)       # Array holding a single predicted label

# Label 1 is reported as positive, anything else as negative
sentiment = "Positive" if prediction[0] == 1 else "Negative"

# Show the outcome
print(f"Predicted Sentiment: {sentiment}")


Enter a review:  good


Predicted Sentiment: Positive
