In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
def load_data(file_path):
    """
    Load data from a CSV file.
    """
    return pd.read_csv(file_path)

In [3]:
def split_inputs_output(data, inputs, output):
    """
    Split features and target variables.
    """
    X = data[inputs]
    y = data[output]
    return X, y

In [4]:
df=pd.read_csv('data.csv')
df.dropna(inplace=True)

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from nltk.corpus import stopwords
from prefect import flow, task
from sklearn.naive_bayes import MultinomialNB
import re
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
la=LabelEncoder()

@task
def load_dat(file_path):
    """
    Load data from a CSV file.
    """
    return pd.read_csv(file_path)


@task
def split_inputs_outpu(data, inputs, output):
    """
    Split features and target variables.
    """
    X = data[inputs]
    y = data[output]
    return X, y
	

@task
def split_train_te(X, y, test_size=0.25, random_state=0):
    """
    Split data into train and test sets.
    """
    return train_test_split(X, y, test_size=test_size, random_state=random_state)
	
	
@task
def preprocess_da(X_train, X_test, y_train, y_test):
    """
    Rescale the data.
    """
    tfid=TfidfVectorizer()
    X_train_id = tfid.fit_transform(X_train)
    X_test_id = tfid.transform(X_test)
    return X_train_id, X_test_id, y_train, y_test
	

@task
def train_mod(X_train_scaled, y_train):
    """
    Training the machine learning model.
    """
    clf = MultinomialNB()
    clf.fit(X_train_scaled, y_train)
    return clf
	

@task
def evaluate_mod(model, X_train_scaled, y_train, X_test_scaled, y_test):
    """
    Evaluating the model.
    """
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    train_score = metrics.accuracy_score(y_train, y_train_pred)
    test_score = metrics.accuracy_score(y_test, y_test_pred)
    
    return train_score, test_score

@task
def calculate(ratings):
    l=[]
    for i in range(len(df['Ratings'])):
        if int(df['Ratings'].iloc[i]) >= 4:
            l.append('positive')
        else:
            l.append('negative')
    return l

@task
def preprocess_(text):
    text = str(text)
    sentence = re.sub("[^a-zA-Z]", " ", text)
    sentence = sentence.lower()
    tokens = sentence.split()
    tokens = [t for t in tokens if not t in stopwords.words("english")]
    tokens = [lemmatizer.lemmatize(word,pos='v') for word in tokens]
    demo=pd.Series([" ".join(tokens), len(tokens)])
    return demo[0]


# Workflow
@flow(name="Random_nb")
def workflow():
    DATA_PATH = "data.csv"
    INPUTS = 'Review text'
    OUTPUT = 'Ratings'
    parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'nb__alpha': (0.1, 0.5, 1)
}
    # Load data
    sentiment = load_dat(DATA_PATH)

    # Identify Inputs and Output
    X=df['Review text']
    val=calculate(df['Ratings'])
    y=la.fit_transform(val)
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = split_train_te(X, y)

    # Preprocess the data
    X_train_scaled, X_test_scaled, y_train, y_test = preprocess_da(X_train, X_test, y_train, y_test)

    # Build a model
    model = train_mod(X_train_scaled, y_train)
    
    # Evaluation
    train_score, test_score = evaluate_mod(model, X_train_scaled, y_train, X_test_scaled, y_test)
    
    print("Train Score:", train_score)
    print("Test Score:", test_score)



if __name__ == "__main__":
    workflow()



  next(self.gen)
  next(self.gen)


Train Score: 0.8894990847062739
Test Score: 0.8622754491017964


In [6]:
from prefect import flow, task

@task
def print_hello(name):
    print(f"Hello {name}!")

@flow(name="Hello Flow")
def hello_world(name="world"):
    print_hello(name)

In [7]:
import httpx
from prefect import flow


@flow
def get_repo_info():
    url = "https://api.github.com/repos/PrefectHQ/prefect"
    response = httpx.get(url)
    response.raise_for_status()
    repo = response.json()
    print("PrefectHQ/prefect repository statistics 🤓:")
    print(f"Stars 🌠 : {repo['stargazers_count']}")
    print(f"Forks 🍴 : {repo['forks_count']}")

if __name__ == "__main__":
    get_repo_info()

PrefectHQ/prefect repository statistics 🤓:
Stars 🌠 : 14366
Forks 🍴 : 1430


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics

from prefect import task, flow

@task
def load_data3(file_path):
    """
    Load data from a CSV file.
    """
    return pd.read_csv(file_path)


@task
def split_inputs_output3(data, inputs, output):
    """
    Split features and target variables.
    """
    X = data[inputs]
    y = data[output]
    return X, y
	

@task
def split_train_test3(X, y, test_size=0.25, random_state=0):
    """
    Split data into train and test sets.
    """
    return train_test_split(X, y, test_size=test_size, random_state=random_state)
	
	
@task
def preprocess_data3(X_train, X_test, y_train, y_test):
    """
    Rescale the data.
    """
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, y_train, y_test
	

@task
def train_model3(X_train_scaled, y_train, hyperparameters):
    """
    Training the machine learning model.
    """
    clf = KNeighborsClassifier(**hyperparameters)
    clf.fit(X_train_scaled, y_train)
    return clf
	

@task
def evaluate_model3(model, X_train_scaled, y_train, X_test_scaled, y_test):
    """
    Evaluating the model.
    """
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    train_score = metrics.accuracy_score(y_train, y_train_pred)
    test_score = metrics.accuracy_score(y_test, y_test_pred)
    
    return train_score, test_score


# Workflow
@flow(name="variable1")
def workflow():
    DATA_PATH = "data.csv"
    INPUTS = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
    OUTPUT = 'Species'
    HYPERPARAMETERS = {'n_neighbors': 3, 'p': 2}
    
    # Load data
    iris = load_data3(DATA_PATH)

    # Identify Inputs and Output
    X, y = split_inputs_output3(iris, INPUTS, OUTPUT)

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = split_train_test3(X, y)

    # Preprocess the data
    X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data3(X_train, X_test, y_train, y_test)

    # Build a model
    model = train_model3(X_train_scaled, y_train, HYPERPARAMETERS)
    
    # Evaluation
    train_score, test_score = evaluate_model3(model, X_train_scaled, y_train, X_test_scaled, y_test)
    
    print("Train Score:", train_score)
    print("Test Score:", test_score)



if __name__ == "__main__":
    workflow()


 `@task(name='my_unique_name', ...)`

 `@task(name='my_unique_name', ...)`

 `@task(name='my_unique_name', ...)`

 `@task(name='my_unique_name', ...)`

 `@task(name='my_unique_name', ...)`

 `@task(name='my_unique_name', ...)`

 `@flow(name='my_unique_name', ...)`


KeyError: "None of [Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], dtype='object')] are in the [columns]"