In [12]:
import os
import pandas as pd

# Dataset Path
# Root folder of the extracted aclImdb dataset (must contain train/ and test/).
# Set the ACLIMDB_DIR environment variable to point at the dataset on another
# machine; the original local path is kept as the fallback so existing runs
# behave exactly as before.
DATASET_PATH = os.environ.get("ACLIMDB_DIR") or r"C:\Users\nadaf\OneDrive\Desktop\Quantbit\aclImdb"

def save_data(dataset_path, output_csv):
    """Collect the labelled review files under ``dataset_path`` into one CSV.

    Reads every ``*.txt`` file in ``<dataset_path>/{train,test}/{pos,neg}``,
    labels it 1 for ``pos`` and 0 for ``neg``, and writes the result to
    ``output_csv`` with the columns ``review`` and ``label``.

    Args:
        dataset_path: Root directory containing the ``train`` and ``test``
            folders.
        output_csv: Destination path of the CSV file (written without the
            DataFrame index).

    Returns:
        pandas.DataFrame with one row per review file, ordered
        train/pos, train/neg, test/pos, test/neg, and by file name within
        each folder.

    Raises:
        FileNotFoundError: If one of the four expected folders is missing.
    """
    reviews = []
    labels = []

    for dataset_type in ['train', 'test']:
        for label_type in ['pos', 'neg']:
            folder_path = os.path.join(dataset_path, dataset_type, label_type)
            if not os.path.exists(folder_path):
                raise FileNotFoundError(f"Folder not found: {folder_path}")

            label = 1 if label_type == 'pos' else 0

            # os.listdir returns entries in arbitrary order. Sorting keeps the
            # row order of the CSV stable, so any seeded sampling done on it
            # later selects the same reviews on every machine.
            for file_name in sorted(os.listdir(folder_path)):
                if not file_name.endswith(".txt"):
                    continue
                with open(os.path.join(folder_path, file_name), encoding="utf-8") as file:
                    reviews.append(file.read())
                labels.append(label)

    # Save the data to a CSV file
    data = pd.DataFrame({"review": reviews, "label": labels})
    data.to_csv(output_csv, index=False)
    return data

# Build the review/label table from the dataset folders and persist it.
# `output_csv` and `data` are reused by the cells that follow.
output_csv = "movie_reviews.csv"
data = save_data(dataset_path=DATASET_PATH, output_csv=output_csv)
print(f"Data saved to {output_csv}")




Data saved to movie_reviews.csv


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Tunable settings for the split, the vectorizer and the classifier
TRAIN_FRACTION = 0.8   # share of rows used for training
RANDOM_STATE = 42      # seed for the train/test sampling
MAX_FEATURES = 5000    # vocabulary size kept by the TF-IDF vectorizer
MAX_ITER = 1000        # iteration cap for the logistic regression solver

# Load data from the CSV file.
# keep_default_na=False stops pandas from converting an empty review (or one
# whose entire text is e.g. "NA" or "null") into NaN, which TfidfVectorizer
# rejects as an invalid document.
data = pd.read_csv(output_csv, keep_default_na=False)

# Split data into training and testing sets
train_data = data.sample(frac=TRAIN_FRACTION, random_state=RANDOM_STATE)
test_data = data.drop(train_data.index)

# The two parts must partition the data: every row in exactly one of them.
assert len(train_data) + len(test_data) == len(data), "train/test split lost or duplicated rows"

# Separate features (reviews) and labels
X_train, y_train = train_data["review"], train_data["label"]
X_test, y_test = test_data["review"], test_data["label"]

# Convert text data to numerical format using TF-IDF.
# The vocabulary is fitted on the training reviews only and then reused,
# unchanged, for the test reviews.
vectorizer = TfidfVectorizer(stop_words='english', max_features=MAX_FEATURES)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train a logistic regression model
model = LogisticRegression(max_iter=MAX_ITER)
model.fit(X_train_vec, y_train)

# Evaluate the model
predictions = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))



Accuracy: 0.8852
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.88      0.88      4973
           1       0.88      0.90      0.89      5027

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [19]:
# Classify a single review typed by the user, using the vectorizer and model
# fitted in the training cell.
review = input("Enter a review: ")  # Input String
review_vec = vectorizer.transform([review])  # Transform the review into numerical features
prediction = model.predict(review_vec)  # Predict sentiment for the review

if review_vec.nnz == 0:
    # Every feature is zero: the input is empty or contains no word from the
    # fitted vocabulary (e.g. only stop words). The predicted class would then
    # come from the model's intercept alone, so report it as unknown instead
    # of presenting it as a real sentiment.
    sentiment = "Unknown (no known words in the review)"
elif prediction[0] == 1:
    sentiment = "Positive"
else:
    sentiment = "Negative"

print(f"Sentiment is  {sentiment}")


Enter a review:  The movie is good


Sentiment is  Positive
