In [1]:
%%capture
!pip install torch==1.13.0 transformers==4.24.0 pyarrow==10.0.1 fastparquet==0.8.1 s3fs==2022.11.0 --upgrade

In [3]:
import os
import boto3
import pandas as pd
from transformers import pipeline
import io
from io import StringIO
import torch
import csv
import numpy as np
import sagemaker

In [4]:
# Report whether PyTorch can see a CUDA device.
print("Cuda available:" + str(torch.cuda.is_available()))

Cuda available:True


In [23]:
# Bucket name and the folder names used for input files and for results.
bucket_name = "mlops-team-8"
directory = "maildir-results-row/"
results_directory = "maildir-results/"

# boto3 handle on the bucket named above.
s3 = boto3.resource("s3")
bucket = s3.Bucket(bucket_name)

In [6]:
# Hugging Face model id used for both the model weights and the tokenizer.
model_spam = "mrm8488/bert-tiny-finetuned-enron-spam-detection"
model_spam_path = model_spam

# Run on the first GPU when CUDA is available; otherwise fall back to CPU
# (device=-1) instead of failing on a host without a GPU.
nlp_spam = pipeline(
    "text-classification",
    model=model_spam_path,
    tokenizer=model_spam_path,
    device=0 if torch.cuda.is_available() else -1,
)


Downloading:   0%|          | 0.00/705 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [14]:
# Build the path of every entry in the input directory, leaving out the
# Jupyter checkpoint folder.
all_files = [
    directory + entry
    for entry in os.listdir(directory)
    if entry != '.ipynb_checkpoints'
]

In [17]:
# Put the paths in ascending order so files are processed in a stable sequence.
all_files.sort()

In [18]:
def get_sentiments(sentiment):
    """Return the sentiment scores as ``[positive, neutral, negative]``.

    Parameters
    ----------
    sentiment : iterable of dict
        Each item must provide a ``"label"`` and a ``"score"`` key. Items
        whose label is not ``positive``/``neutral``/``negative`` are ignored.

    Returns
    -------
    list
        Three entries in the fixed order positive, neutral, negative. A slot
        stays ``None`` when its label does not occur in ``sentiment``.
    """
    scores = {"positive": None, "neutral": None, "negative": None}
    for value in sentiment:
        label = value["label"]
        if label in scores:
            scores[label] = value["score"]
    # Build the result after the whole input has been read, so the order of
    # the entries (or a missing "negative" entry) cannot affect the outcome.
    return [scores["positive"], scores["neutral"], scores["negative"]]

def extract_summary(summ):
    """Return the summary text held by ``summ``.

    A dict yields its ``'summary_text'`` entry; any other value (a plain
    string included) is handed back unchanged.
    """
    return summ['summary_text'] if isinstance(summ, dict) else summ

def get_number(list):
    """Count the comma-separated entries in ``list``.

    Parameters
    ----------
    list : str, collection, None or NaN
        Usually a comma-separated string. The parameter name shadows the
        builtin and is kept only so existing callers keep working.

    Returns
    -------
    int
        ``0`` for ``None``, NaN, or an empty/blank string; the number of
        comma-separated parts for a string; ``len(...)`` for a value that is
        already a collection.
    """
    recipients = list
    if recipients is None:
        return 0
    # NaN is the only float that differs from itself; pandas uses it for
    # missing values, and it has no .split().
    if isinstance(recipients, float) and recipients != recipients:
        return 0
    if isinstance(recipients, str):
        # ''.split(',') yields [''], which would count as one entry.
        if not recipients.strip():
            return 0
        return len(recipients.split(','))
    return len(recipients)

In [31]:
# Run spam detection over every input file and write the enriched records to
# results_directory. Files that already have an output are skipped, so an
# interrupted run can be resumed without redoing finished work.
spam_label_names = {'LABEL_0': 'Not Spam', 'LABEL_1': 'Spam'}
n = len(all_files)
# to_json does not create missing folders.
os.makedirs(results_directory, exist_ok=True)
print("starting")
for i, email_file in enumerate(all_files):
    filename = os.path.basename(email_file)
    out_path = f'{results_directory}{filename}'
    # Check for an existing result before parsing the input file.
    if not os.path.isfile(out_path):
        df = pd.read_json(email_file)
        df['body_cleansed'] = df['body_cleansed'].fillna('')
        spam = nlp_spam(df["body_cleansed"].tolist(), max_length=512, truncation=True)
        # Assign by position (one prediction per row) so the result does not
        # depend on the frame's index matching 0..n-1.
        df['spam_label'] = [spam_label_names.get(pred['label'], pred['label']) for pred in spam]
        df['spam_score'] = [pred['score'] for pred in spam]
        # Sentiment is the column with the highest of the three existing scores.
        df['sentiment'] = df[['positive', 'negative', 'neutral']].idxmax(axis=1)
        # Count the recipients in the "to" field.
        df["to_count"] = df["to"].apply(get_number)
        # Resetting index
        df = df.reset_index(drop=True)
        df.to_json(out_path, orient="records")
    perc = (i + 1) / n * 100
    print(f"Progress: {perc:.2f}%")



starting
Progress: 0.67%
Progress: 1.33%
Progress: 2.00%
Progress: 2.67%
Progress: 3.33%
Progress: 4.00%
Progress: 4.67%
Progress: 5.33%
Progress: 6.00%
Progress: 6.67%
Progress: 7.33%
Progress: 8.00%
Progress: 8.67%
Progress: 9.33%
Progress: 10.00%
Progress: 10.67%
Progress: 11.33%
Progress: 12.00%
Progress: 12.67%
Progress: 13.33%
Progress: 14.00%
Progress: 14.67%
Progress: 15.33%
Progress: 16.00%
Progress: 16.67%
Progress: 17.33%
Progress: 18.00%
Progress: 18.67%
Progress: 19.33%
Progress: 20.00%
Progress: 20.67%
Progress: 21.33%
Progress: 22.00%
Progress: 22.67%
Progress: 23.33%
Progress: 24.00%
Progress: 24.67%
Progress: 25.33%
Progress: 26.00%
Progress: 26.67%
Progress: 27.33%
Progress: 28.00%
Progress: 28.67%
Progress: 29.33%
Progress: 30.00%
Progress: 30.67%
Progress: 31.33%
Progress: 32.00%
Progress: 32.67%
Progress: 33.33%
Progress: 34.00%
Progress: 34.67%
Progress: 35.33%
Progress: 36.00%
Progress: 36.67%
Progress: 37.33%
Progress: 38.00%
Progress: 38.67%
Progress: 39.33%
Pr

In [32]:
# Upload the results folder to the bucket under the same folder name.
sess = sagemaker.Session()
s3_path_to_data = sess.upload_data(
    path=results_directory,
    bucket=bucket_name,
    # upload_data joins key_prefix and the file name with "/", so a trailing
    # slash here would produce keys containing "//".
    key_prefix=results_directory.rstrip('/'),
)

In [None]:
# Ask SageMaker to stop the notebook instance with this name.
sm = boto3.client("sagemaker")
notebook = "mlops-team-8"
sm.stop_notebook_instance(NotebookInstanceName=notebook)