In [2]:
# Complet   NLP_Pipelines to creat data
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
import re

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
class TextPreprocessor:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def preprocess(self, text):
        # Convert to lowercase
        text = text.lower()

        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Tokenize
        tokens = word_tokenize(text)

        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token)
                 for token in tokens
                 if token not in self.stop_words]

        return ' '.join(tokens)

class NLPPipeline:
    def __init__(self, max_features=5000):
        self.preprocessor = TextPreprocessor()
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            ngram_range=(1, 2)  # Include unigrams and bigrams
        )

        # Initialize different models
        self.models = {
            'naive_bayes': MultinomialNB(),
            'logistic_regression': LogisticRegression(max_iter=1000),
            'svm': LinearSVC(max_iter=1000)
        }

    def prepare_data(self, texts, labels):
        # Preprocess all texts
        processed_texts = [self.preprocessor.preprocess(text) for text in texts]

        # Create TF-IDF features
        X = self.vectorizer.fit_transform(processed_texts)

        # Split data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, labels, test_size=0.2, random_state=42
        )

    def train_and_evaluate(self):
        results = {}

        for name, model in self.models.items():
            # Train model
            model.fit(self.X_train, self.y_train)

            # Make predictions
            y_pred = model.predict(self.X_test)

            # Calculate metrics
            results[name] = {
                'classification_report': classification_report(self.y_test, y_pred),
                'confusion_matrix': confusion_matrix(self.y_test, y_pred)
            }

        return results

    def predict_new(self, texts):
        # Preprocess new texts
        processed_texts = [self.preprocessor.preprocess(text) for text in texts]

        # Transform to TF-IDF features
        X_new = self.vectorizer.transform(processed_texts)

        # Make predictions with each model
        predictions = {}
        for name, model in self.models.items():
            predictions[name] = model.predict(X_new)

        return predictions

# Example usage
if __name__ == "__main__":
    # Sample data (replace with your actual dataset)
    texts = [
        "This movie is memorable ",
        "You also hurt me and disappointed",
        " I hate you",
        "This movie was fantastic! I loved every minute of it.",
        "Terrible waste of time. Worst movie ever.",
        "Great acting, but the plot was weak.",
        # ... add more examples
    ]

    labels = [1, 0, 0, 1, 0, 1]  # 1 for positive, 0 for negative

    # Create and run pipeline
    pipeline = NLPPipeline(max_features=3000)

    # Prepare data
    pipeline.prepare_data(texts, labels)

    # Train and evaluate models
    results = pipeline.train_and_evaluate()

    # Print results
    for model_name, metrics in results.items():
        print(f"\nResults for {model_name}:")
        print("\nClassification Report:")
        print(metrics['classification_report'])
        print("\nConfusion Matrix:")
        print(metrics['confusion_matrix'])

    # Example of predicting new texts
    new_texts = [
        "I really enjoyed this movie!",
        "This was a complete disappointment."
    ]

    predictions = pipeline.predict_new(new_texts)
    for model_name, preds in predictions.items():
        print(f"\nPredictions from {model_name}:")
        for text, pred in zip(new_texts, preds):
            print(f"Text: {text}")
            print(f"Prediction: {'Positive' if pred == 1 else 'Negative'}")



Results for naive_bayes:

Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2


Confusion Matrix:
[[1 0]
 [1 0]]

Results for logistic_regression:

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2


Confusion Matrix:
[[0 1]
 [0 1]]

Results for svm:

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
