In [2]:
pip install -U "prefect==2.16.2"

Collecting prefect==2.16.2
  Downloading prefect-2.16.2-py3-none-any.whl.metadata (10 kB)
Collecting aiosqlite>=0.17.0 (from prefect==2.16.2)
  Using cached aiosqlite-0.20.0-py3-none-any.whl.metadata (4.3 kB)
Collecting apprise<2.0.0,>=1.1.0 (from prefect==2.16.2)
  Using cached apprise-1.7.4-py3-none-any.whl.metadata (44 kB)
Collecting asyncpg>=0.23 (from prefect==2.16.2)
  Using cached asyncpg-0.29.0-cp312-cp312-win_amd64.whl.metadata (4.5 kB)
Collecting dateparser<2.0.0,>=1.1.1 (from prefect==2.16.2)
  Using cached dateparser-1.2.0-py2.py3-none-any.whl.metadata (28 kB)
Collecting docker<7.0,>=4.0 (from prefect==2.16.2)
  Using cached docker-6.1.3-py3-none-any.whl.metadata (3.5 kB)
Collecting graphviz>=0.20.1 (from prefect==2.16.2)
  Using cached graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting griffe>=0.20.0 (from prefect==2.16.2)
  Using cached griffe-0.42.1-py3-none-any.whl.metadata (6.2 kB)
Collecting kubernetes<30.0.0,>=24.2.0 (from prefect==2.16.2)
  Using cached ku

  error: subprocess-exited-with-error
  
  × Preparing metadata (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [6 lines of output]
      Checking for Rust toolchain....
      
      Cargo, the Rust package manager, is not installed or is not on PATH.
      This package requires Rust and Cargo to compile extensions. Install it through
      the system's package manager or via https://rustup.rs/
      
      [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed

× Encountered error while generating package metadata.
╰─> See above for output.

note: This is an issue with the package mentioned above, not pip.
hint: See above for details.


In [6]:
import os
import re
import string
import time

import joblib
import nltk
import numpy as np
import pandas as pd
import prefect
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from prefect import task, Flow
from prefect import flow
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

@task
def load_data(file_path):
    """Read the CSV at ``file_path`` and return it without incomplete rows.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A DataFrame containing only the rows that have no missing value in
        any column.
    """
    raw_frame = pd.read_csv(file_path)
    complete_rows = raw_frame.dropna()
    return complete_rows

@task
def preprocess_data(df):
    """Derive binary sentiment labels, split train/test and normalise the text.

    Ratings of 4 and 5 are labelled 1; ratings of 1, 2 and 3 are labelled 0.
    Any other rating value has no entry in the mapping and becomes NaN.
    The caller's DataFrame is left unmodified.

    Args:
        df: DataFrame with a 'Ratings' column and a 'Review text' column.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``: the cleaned and
        lemmatised review text and the matching sentiment labels, split
        75% / 25% with a fixed random seed.

    Note:
        Relies on the NLTK stop-word, WordNet and tokenizer data already being
        available locally; nothing in this function downloads them.
    """
    ratings_dict = {5.0: 1, 4.0: 1, 3.0: 0, 2.0: 0, 1.0: 0}
    x = df['Review text']
    # Build the labels as a standalone Series instead of writing a new column
    # into the frame that was passed in.
    y = df['Ratings'].map(ratings_dict).rename('sentiment')

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=50)

    # Created once here rather than once per review inside the helpers.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def clean_text(text):
        """Keep ASCII letters only, lower-case, and drop English stop words."""
        # Replacing every non-letter with a space leaves nothing but letters
        # and spaces, so no separate punctuation pass is needed; split()
        # absorbs the resulting runs of spaces.
        words = re.sub(r"[^a-zA-Z]", " ", text).lower().split()
        return ' '.join(word for word in words if word not in stop_words)

    def lemmatize_text(text):
        """Tokenise ``text`` and join the lemmatised tokens with single spaces."""
        tokens = nltk.word_tokenize(text)
        return ' '.join(lemmatizer.lemmatize(word) for word in tokens)

    x_train = x_train.apply(clean_text).apply(lemmatize_text)
    x_test = x_test.apply(clean_text).apply(lemmatize_text)

    return x_train, x_test, y_train, y_test

@task
def train_model(x_train, y_train):
    """Grid-search a TF-IDF + logistic-regression pipeline and return the best one.

    Args:
        x_train: Training documents (text) fed to the TF-IDF vectoriser.
        y_train: Training labels aligned with ``x_train``.

    Returns:
        The pipeline with the hyper-parameter combination that achieved the
        best mean accuracy over 5-fold cross-validation
        (``GridSearchCV.best_estimator_``).
    """
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        # liblinear supports both the 'l1' and 'l2' penalties searched below.
        # The default lbfgs solver rejects 'l1', so with it every l1 candidate
        # in the grid fails to fit.
        ('classifier', LogisticRegression(solver='liblinear'))
    ])

    param_grid = {
        'tfidf__max_features': [1000, 2000, 3000],
        'classifier__C': [0.1, 1, 10],
        'classifier__penalty': ['l1', 'l2']
    }

    grid_search = GridSearchCV(estimator=pipeline,
                               param_grid=param_grid,
                               cv=5,
                               scoring='accuracy',
                               return_train_score=True,
                               verbose=1
                               )
    grid_search.fit(x_train, y_train)
    return grid_search.best_estimator_

@task
def evaluate_model(model, x_test, y_test):
    """Score ``model`` on the held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_test: Inputs to predict on.
        y_test: True labels for ``x_test``.

    Returns:
        The accuracy of the model's predictions for ``x_test`` against ``y_test``.
    """
    predictions = model.predict(x_test)
    return accuracy_score(y_test, predictions)

# Prefect 2.x (the version this notebook installs) has no `with Flow(...)`
# context manager and no `flow.run()`: a flow is a decorated function, and
# calling it executes the tasks it invokes.
@flow(name="Sentiment Analysis")
def sentiment_analysis_flow(file_path="data.csv"):
    """Load the reviews, preprocess them, train the classifier and score it.

    Args:
        file_path: CSV file handed to ``load_data``.

    Returns:
        The test-set accuracy reported by ``evaluate_model``.
    """
    reviews = load_data(file_path)
    x_train, x_test, y_train, y_test = preprocess_data(reviews)
    model = train_model(x_train, y_train)
    return evaluate_model(model, x_test, y_test)


test_accuracy = sentiment_analysis_flow()
test_accuracy


ModuleNotFoundError: No module named 'prefect'