In [2]:
import numpy as np
import pandas as pd


import os
from pathlib import Path

# Directory holding the two CSV splits. The default is the original location,
# so existing runs behave exactly as before; set MLOPS_DATA_DIR to run the
# notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get("MLOPS_DATA_DIR", "C:/Users/tim/Desktop/mlops/data"))

# NOTE(review): neither file seems to have a header row. read_csv therefore
# uses the first record of each file as column names (see the column names in
# the output below), the two frames end up with different columns, and the
# concat produces 8 columns with NaN-filled halves. Reading with
# header=None / names=[...] would fix this, but the drop/rename cell further
# down refers to these accidental column names, so it is left unchanged here.
df_train = pd.read_csv(DATA_DIR / "twitter_training.csv")
df_val = pd.read_csv(DATA_DIR / "twitter_validation.csv")
df = pd.concat([df_train, df_val], ignore_index=True)
df.head()


Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,",3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom‚Äôs great auntie as ‚ÄòHayley can‚Äôt get out of bed‚Äô and told to his grandma, who now thinks I‚Äôm a lazy, terrible person ü§£"
0,2401.0,Borderlands,Positive,I am coming to the borders and I will kill you...,,,,
1,2401.0,Borderlands,Positive,im getting on borderlands and i will kill you ...,,,,
2,2401.0,Borderlands,Positive,im coming on borderlands and i will murder you...,,,,
3,2401.0,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,,,,
4,2401.0,Borderlands,Positive,im getting into borderlands and i can murder y...,,,,


In [3]:
# Summarise the concatenated frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75680 entries, 0 to 75679
Data columns (total 8 columns):
 #   Column                                                                                                                                                                                                                                              Non-Null Count  Dtype  
---  ------                                                                                                                                                                                                                                              --------------  -----  
 0   2401                                                                                                                                                                                                                                                74681 non-null  float64
 1   Borderlands                                                         

In [4]:
# Columns to discard: '2401' is the first (numeric) column of the training file;
# the remaining four are the column names that came from the validation CSV.
drop_columns = [
    '2401',
    '3364',
    'Facebook',
    'Irrelevant',
    "I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom‚Äôs great auntie as ‚ÄòHayley can‚Äôt get out of bed‚Äô and told to his grandma, who now thinks I‚Äôm a lazy, terrible person ü§£",
]

# Drop the unwanted columns, then give the three surviving columns readable names.
df = (
    df
    .drop(columns=drop_columns)
    .rename(columns={
        'Borderlands': 'Branch',
        'Positive': 'Sentiment',
        'im getting on borderlands and i will murder you all ,': 'Tweet',
    })
)

df.head()

Unnamed: 0,Branch,Sentiment,Tweet
0,Borderlands,Positive,I am coming to the borders and I will kill you...
1,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,Borderlands,Positive,im coming on borderlands and i will murder you...
3,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,Borderlands,Positive,im getting into borderlands and i can murder y...


In [5]:
# Count missing values per column.
df.isna().sum()

Branch        999
Sentiment     999
Tweet        1685
dtype: int64

In [6]:
# Remove every row with a missing value in any column. Rebinding the name
# (instead of inplace=True) keeps the cell free of hidden in-place mutation;
# the resulting frame is the same.
df = df.dropna()
# Confirm that no nulls remain.
df.isnull().sum()

Branch       0
Sentiment    0
Tweet        0
dtype: int64

In [7]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
df.duplicated().sum()

np.int64(3038)

In [8]:
# Keep only the first occurrence of each duplicated row; the index is not reset.
df = df.drop_duplicates()
# Confirm that no duplicates remain.
df.duplicated().sum()

np.int64(0)

In [9]:
import re

def toLabel(df, column):
    """Encode the distinct values of ``df[column]`` as consecutive integers.

    Codes start at 0 and follow the order of first appearance in the column
    (the order returned by ``Series.unique()``).

    The encoding is applied to a copy, so the frame passed in is left
    untouched. The previous version wrote the codes back into the caller's
    frame, which silently changed ``df`` for every later cell and can trigger
    pandas' chained-assignment warning when ``df`` is itself a slice.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column to encode.

    Returns:
        A new DataFrame equal to ``df`` except that ``column`` holds the
        integer codes.
    """
    encoded = df.copy()
    categories = encoded[column].unique()
    mapping = {label: idx for idx, label in enumerate(categories)}
    encoded[column] = encoded[column].map(mapping)
    return encoded

def tostring(df, column):
    """Cast ``df[column]`` to ``str``.

    The cast is applied to a copy, so the frame passed in is not modified
    (the previous version overwrote the column in the caller's frame).

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column to convert.

    Returns:
        A new DataFrame equal to ``df`` except that ``column`` has been
        converted with ``astype(str)``.
    """
    converted = df.copy()
    converted[column] = converted[column].astype(str)
    return converted

def removeHTMLTags(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed on its own. ``.`` does not
    match newlines here, so a ``<`` and ``>`` on different lines are left alone.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

def removeURLs(text):
    """Return ``text`` with URLs removed.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character.
    """
    return re.compile(r'http[s]?://\S+|www\.\S+').sub('', text)

def preprocess_text(df):
    """Prepare the tweet frame for modelling.

    Steps, in order: integer-encode the 'Sentiment' column, cast 'Tweet' to
    ``str``, then strip HTML tags and URLs from every tweet.

    Args:
        df: DataFrame with 'Sentiment' and 'Tweet' columns.

    Returns:
        The processed DataFrame.
    """
    df = tostring(toLabel(df, 'Sentiment'), 'Tweet')
    # Tags are removed before URLs, matching the order of the two cleaners.
    df['Tweet'] = df['Tweet'].apply(removeHTMLTags).apply(removeURLs)
    return df

# Run the full preprocessing pipeline; the cells below read from df_pre.
df_pre = preprocess_text(df)

In [10]:
# Model inputs: the cleaned tweets as a plain Python list (passed to the tokenizer later).
X = df_pre['Tweet'].tolist()
# Targets: the integer-encoded sentiment, kept as a pandas Series.
y = df_pre['Sentiment']

y.head()

0    0
1    0
2    0
3    0
4    0
Name: Sentiment, dtype: int64

In [11]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# Fine-tuning hyper-parameters
MODEL_NAME = "bert-base-uncased"  # checkpoint name used for both the tokenizer and the model
MAX_LENGTH = 128  # passed as max_length when tokenizing (inputs are padded/truncated to it)
BATCH_SIZE = 16  # per-device batch size used for both training and evaluation
NUM_EPOCHS = 3
LEARNING_RATE = 2e-5

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Hold out 20% of the rows for validation; random_state fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

def tokenize_data(texts, max_len):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly ``max_len`` tokens and
    the result is returned as PyTorch tensors.

    Args:
        texts: The texts to encode (called below with lists of tweet strings).
        max_len: Target sequence length used for both padding and truncation.

    Returns:
        Whatever the tokenizer call returns for these options.
    """
    tokenizer_options = {
        'max_length': max_len,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **tokenizer_options)

# Tokenize both splits up front; every sequence is padded/truncated to MAX_LENGTH.
train_encodings = tokenize_data(X_train, MAX_LENGTH)
val_encodings = tokenize_data(X_val, MAX_LENGTH)



In [None]:
class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-computed encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable container and
    ``labels`` is an object exposing ``.values`` (a pandas Series in this
    notebook). Item ``i`` is a dict with the ``i``-th entry of every encoding
    field plus a ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Store the raw values so that indexing is positional.
        self.labels = labels.values

    def __len__(self):
        """Number of samples, taken from the label array."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of fields plus ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
    
# Wrap the tokenized splits and their labels for use with the Trainer.
train_dataset = TweetDataset(train_encodings, y_train)
val_dataset = TweetDataset(val_encodings, y_val)

import mlflow
import os

# ===================================================================
# MLflow Configuration
# ===================================================================

# Send runs to the remote MLflow tracking server (address is hard-coded).
mlflow.set_tracking_uri("http://ec2-54-249-36-142.ap-northeast-1.compute.amazonaws.com:5000/") 

# All runs from this notebook are grouped under this experiment.
EXPERIMENT_NAME = "BERT_Sentiment_Analysis"
mlflow.set_experiment(EXPERIMENT_NAME)


# Pretrained encoder with a 4-way classification head.
# NOTE(review): num_labels=4 is hard-coded; presumably it equals the number of
# distinct 'Sentiment' values — confirm against df_pre['Sentiment'].nunique().
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=4)

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the
            predicted class).

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    metrics = {'accuracy': accuracy_score(y_true, y_hat)}
    # The fourth element of the tuple (support) is not reported.
    weighted = precision_recall_fscore_support(y_true, y_hat, average='weighted')
    metrics.update(zip(('precision', 'recall', 'f1'), weighted[:3]))
    return metrics

# Training configuration: evaluate and save a checkpoint once per epoch, log to
# MLflow under the run name "bert", and reload the checkpoint with the best
# 'accuracy' once training ends.
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=NUM_EPOCHS,              
    per_device_train_batch_size=BATCH_SIZE,  
    per_device_eval_batch_size=BATCH_SIZE,   
    eval_strategy='epoch',         
    save_strategy='epoch',
    report_to="mlflow",
    run_name="bert",
    learning_rate=LEARNING_RATE,
    metric_for_best_model='accuracy',
    load_best_model_at_end=True
)

# Tie together the model, the configuration, both datasets and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model; the returned TrainOutput is shown as the cell output.
trainer.train()





Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6053,0.527544,0.808484,0.811211,0.808484,0.806502
2,0.2735,0.327605,0.889656,0.890094,0.889656,0.889414
3,0.136,0.346149,0.909033,0.909202,0.909033,0.908923


üèÉ View run bert at: http://ec2-54-249-36-142.ap-northeast-1.compute.amazonaws.com:5000/#/experiments/421334797618325639/runs/d8218426e05242d582bd0d9bf30c6edb
üß™ View experiment at: http://ec2-54-249-36-142.ap-northeast-1.compute.amazonaws.com:5000/#/experiments/421334797618325639


TrainOutput(global_step=10644, training_loss=0.4289800345740879, metrics={'train_runtime': 2417.3248, 'train_samples_per_second': 70.448, 'train_steps_per_second': 4.403, 'total_flos': 1.120182544286208e+16, 'train_loss': 0.4289800345740879, 'epoch': 3.0})

In [12]:
class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-computed encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable container and
    ``labels`` is an object exposing ``.values`` (a pandas Series in this
    notebook). Item ``i`` is a dict with the ``i``-th entry of every encoding
    field plus a ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Store the raw values so that indexing is positional.
        self.labels = labels.values

    def __len__(self):
        """Number of samples, taken from the label array."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of fields plus ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
    
# Wrap the tokenized splits and their labels for use with the Trainer.
train_dataset = TweetDataset(train_encodings, y_train)
val_dataset = TweetDataset(val_encodings, y_val)

In [None]:
# Load the fine-tuned weights from a saved checkpoint instead of retraining.
# The step number in the path (10644) is hard-coded; it matches the
# global_step reported in the training output.
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("./results/checkpoint-10644", num_labels=4)

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the
            predicted class).

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    metrics = {'accuracy': accuracy_score(y_true, y_hat)}
    # The fourth element of the tuple (support) is not reported.
    weighted = precision_recall_fscore_support(y_true, y_hat, average='weighted')
    metrics.update(zip(('precision', 'recall', 'f1'), weighted[:3]))
    return metrics

# NOTE(review): training_args is built here but never passed to the Trainer
# below (there is no `args=` argument), so the evaluation appears to run with
# the Trainer's default arguments rather than BATCH_SIZE etc. Confirm whether
# that is intended before wiring it in.
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=NUM_EPOCHS,              
    per_device_train_batch_size=BATCH_SIZE,  
    per_device_eval_batch_size=BATCH_SIZE,   
    eval_strategy='epoch',         
    save_strategy='epoch',
    report_to="mlflow",
    run_name="bert",
    learning_rate=LEARNING_RATE,
    metric_for_best_model='accuracy',
    load_best_model_at_end=True
)


# Evaluation: score the reloaded checkpoint on the validation split.
from transformers import Trainer
trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics
)
eval_result = trainer.evaluate(eval_dataset=val_dataset)

# Print evaluation results (a dict of eval_* metrics, see the output below)
print(eval_result)




{'eval_loss': 0.3461490273475647, 'eval_model_preparation_time': 0.003, 'eval_accuracy': 0.9090332581736189, 'eval_precision': 0.9092015201011885, 'eval_recall': 0.9090332581736189, 'eval_f1': 0.908922948806644, 'eval_runtime': 57.7454, 'eval_samples_per_second': 245.768, 'eval_steps_per_second': 30.721}


In [18]:
# Run inference on the validation split (the same data that was evaluated
# above, not unseen data) and take the arg-max class for each row.
predictions = trainer.predict(test_dataset=val_dataset)
preds = predictions.predictions.argmax(-1)
print(preds)

[0 0 1 ... 3 0 3]


In [None]:
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForSequenceClassification
import os
import pandas as pd
import numpy as np



Device set to use cuda:0


Unnamed: 0,prob_label_0,prob_label_1,prob_label_2,prob_label_3
0,0.947521,0.000279,0.000938,0.051262
1,0.087461,0.017242,0.844783,0.050514


In [23]:
import mlflow
from mlflow.models import infer_signature
from transformers import AutoTokenizer, pipeline

# Send runs to the remote MLflow tracking server (address is hard-coded).
mlflow.set_tracking_uri("http://ec2-54-249-36-142.ap-northeast-1.compute.amazonaws.com:5000/") 

EXPERIMENT_NAME = "BERT_Sentiment_Analysis"
mlflow.set_experiment(EXPERIMENT_NAME)

# Open a new MLflow run and log the fine-tuned model as a text-classification pipeline.
with mlflow.start_run():
    # Absolute path to the local checkpoint directory (step number is hard-coded).
    current_dir = os.path.abspath(os.getcwd())
    checkpoint_path = os.path.join(current_dir, "results", "checkpoint-10644")

    # Weights come from the checkpoint; the tokenizer is loaded from the base model name.
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path, num_labels=4)
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Two sample sentences, used both to infer the signature and as the logged input example.
    input_example = pd.DataFrame({
        "text": ["I love this product!", "This is the worst service ever."]
    })

    # NOTE(review): return_all_scores is, as far as I know, deprecated in recent
    # transformers releases in favour of top_k — confirm against the installed
    # version. The code below relies on each result being a list of per-label
    # dicts that carry a "score" key.
    analysis_pipeline = pipeline(
        "text-classification", 
        model=model, 
        tokenizer=tokenizer,
        return_all_scores=True
    )
    
    preds = analysis_pipeline(input_example['text'].tolist(), truncation=True)
    # convert to numpy array shape (n_samples, n_labels)
    probs = np.array([[d["score"] for d in item] for item in preds])

    # One output column per label position: prob_label_0, prob_label_1, ...
    col_names = [f"prob_label_{i}" for i in range(probs.shape[1])]
    output_example = pd.DataFrame(probs, columns=col_names)

    # Infer the model signature from the example input/output frames and log the pipeline.
    signature = infer_signature(input_example, output_example)
    mlflow.transformers.log_model(
        transformers_model=analysis_pipeline,
        artifact_path="model",
        signature=signature,
        input_example=input_example
    )
        


    
    

Device set to use cuda:0


Downloading artifacts:   0%|          | 0/13 [00:00<?, ?it/s]

Device set to use cuda:0


üèÉ View run lyrical-gnat-18 at: http://ec2-54-249-36-142.ap-northeast-1.compute.amazonaws.com:5000/#/experiments/421334797618325639/runs/6f1020a01a2e4bd98e8b2f5a647202cd
üß™ View experiment at: http://ec2-54-249-36-142.ap-northeast-1.compute.amazonaws.com:5000/#/experiments/421334797618325639


In [4]:
import mlflow

# Point the client at the remote tracking server and select the experiment.
mlflow.set_tracking_uri("http://ec2-54-249-36-142.ap-northeast-1.compute.amazonaws.com:5000/") 

EXPERIMENT_NAME = "BERT_Sentiment_Analysis"
mlflow.set_experiment(EXPERIMENT_NAME)

# Start a run only to capture its ID.
# NOTE: this creates a brand-new (empty) run every time the cell executes; it
# does not attach to an existing run. To resume a known run, pass its ID:
# mlflow.start_run(run_id=...).
with mlflow.start_run() as run:
    run_id = run.info.run_id
# The context manager has already ended the run on exit, so no explicit
# mlflow.end_run() call is needed here.


üèÉ View run incongruous-cat-296 at: http://ec2-54-249-36-142.ap-northeast-1.compute.amazonaws.com:5000/#/experiments/421334797618325639/runs/0f227eefe18a44b4af665c99b9799fc3
üß™ View experiment at: http://ec2-54-249-36-142.ap-northeast-1.compute.amazonaws.com:5000/#/experiments/421334797618325639
