In [29]:
import re
import boto3
import mlflow
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import TextOverviewPreset, ClassificationPreset

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [1]:
import pandas as pd

# Download model

In [5]:
logged_model = f's3://mlops-zoomcamp-quocvo/1/fe0a57b41cf34581a9e613324c061ef6/artifacts/model'
model = mlflow.pyfunc.load_model(logged_model)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

# Load data

In [6]:
data = pd.read_csv('../../data/raw/cyberbullying_tweets.csv')
data.shape

(47692, 2)

In [7]:
data.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [8]:
def convert_target(x):
    return x if x == 'not_cyberbullying' else 'cyberbullying'

In [9]:
data['cyberbullying_type'] = data['cyberbullying_type'].apply(convert_target)

In [10]:
new_df = data[['tweet_text', 'cyberbullying_type']].copy()

In [11]:
min_count = new_df['cyberbullying_type'].value_counts().min()

In [12]:
df_balanced = new_df.groupby('cyberbullying_type').apply(lambda x: x.sample(min_count)).reset_index(drop=True)

In [13]:
value_counts = df_balanced['cyberbullying_type'].value_counts()
value_counts

cyberbullying        7945
not_cyberbullying    7945
Name: cyberbullying_type, dtype: int64

In [14]:
df_saved = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [17]:
df_saved.tail()

Unnamed: 0,tweet_text,cyberbullying_type
15885,Got accepted to my first college!,not_cyberbullying
15886,Christian Woman in Pakistan Dies After Being S...,cyberbullying
15887,Now my dumb nigger ass turn right around and f...,cyberbullying
15888,Seriously stoked the girls got through they ar...,not_cyberbullying
15889,"Hey, sand nigger singing in fucking hindu, shu...",cyberbullying


In [16]:
df_saved.to_csv('cyberbullying_tweets_new.csv', index=False)

# Split data

In [36]:
reference, current = train_test_split(use, test_size=0.2, random_state=42, stratify=use['cyberbullying_type'])

In [37]:
reference.shape

(15260, 2)

In [38]:
current.shape

(3816, 2)

In [39]:
def clean_text(text):
    # Remove HTML tags
    text = re.sub('<.*?>', '', text)
    
    # Remove non-alphabetic characters and convert to lowercase
    text = re.sub('[^a-zA-Z]', ' ', text).lower()
    
    # Remove URLs, mentions, and hashtags from the text
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'#\S+', '', text)
    
    # Tokenize the text
    words = nltk.word_tokenize(text)
    
    # Remove stopwords
    words = [w for w in words if w not in stopwords.words('english')]
    
    # Stem the words
    stemmer = PorterStemmer()
    words = [stemmer.stem(w) for w in words]
    
    # Join the words back into a string
    text = ' '.join(words)
    return text

In [40]:
current['processed_text'] = current['tweet_text'].apply(lambda x: clean_text(x))
reference['processed_text'] = reference['tweet_text'].apply(lambda x: clean_text(x))

In [41]:
current['prediction'] = model.predict(current.processed_text)
reference['prediction'] = model.predict(reference.processed_text)

In [42]:
def calculate_metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    f1 = f1_score(y_true, y_pred, average='micro')
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [43]:
# Train Metrics
calculate_metrics(current.cyberbullying_type, current.prediction)

{'accuracy': 0.495020964360587,
 'precision': 0.495020964360587,
 'recall': 0.495020964360587,
 'f1': 0.495020964360587}

In [44]:
# Reference Metrics
calculate_metrics(reference.cyberbullying_type, reference.prediction)

{'accuracy': 0.4939056356487549,
 'precision': 0.4939056356487549,
 'recall': 0.4939056356487549,
 'f1': 0.4939056356487549}

# Evidently Report

In [45]:
col_mapping = ColumnMapping(
    text_features=['processed_text'],
    target = 'cyberbullying_type',
    prediction = 'prediction'
)

In [46]:
report = Report(metrics=[
    TextOverviewPreset(column_name='processed_text'),
    ClassificationPreset()
])

In [47]:
report.run(current_data=current, reference_data=reference, column_mapping=col_mapping)

ZeroDivisionError: division by zero