### Running the previous step in the pipeline: Importing the dataset.

In [1]:
# Execute the previous pipeline notebook inside this kernel before continuing.
# NOTE(review): later cells use `training_set` and `test_set`, which are not
# defined anywhere in this notebook - presumably they are created by this run.
%run ./1.importing_dataset.ipynb

  from pandas.core import (


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'source', 'reasoning'],
        num_rows: 80000
    })
    validation: Dataset({
        features: ['text', 'label', 'source', 'reasoning'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['text', 'label', 'source', 'reasoning'],
        num_rows: 10200
    })
})
                                                text            label  \
0  Your flight has been rescheduled for 10:00 AM ...          neutral   
1  We're happy to accommodate your dietary prefer...           polite   
2  Our vegetarian options are available on the me...          neutral   
3  I understand your frustration with the recent ...  somewhat polite   
4  I'll do my best to find a suitable replacement...  somewhat polite   

                                  source  \
0  meta-llama/Meta-Llama-3.1-8B-Instruct   
1  meta-llama/Meta-Llama-3.1-8B-Instruct   
2  meta-llama/Meta-Llama-3.1-8B-Instruct   
3  meta-llama/Meta-Llama-3.1

## **Extracting text corpus**
##### We have to extract the text of every document in the dataset so that we can build different representations of it to operate on.
##### Note that this is an unclean version of the corpus

In [2]:
# Pull the raw "text" column out of each split exactly once.
# The previous version evaluated `training_set['text'][i]` inside an index
# loop, i.e. it re-read the whole column for every single row. When column
# access materialises the full column (as Hugging Face `Dataset` objects do on
# many versions) that is quadratic in the number of rows.
# NOTE(review): assumes the splits are positionally indexed (a Dataset, or a
# DataFrame with the default 0..n-1 index) - confirm in 1.importing_dataset.
unclean_corpus = list(training_set["text"])
print(unclean_corpus[0:5])

unclean_corpus_test = list(test_set["text"])
print(unclean_corpus_test[0:5])

["Your flight has been rescheduled for 10:00 AM tomorrow. Please check the airport's website for any updates or changes.", "We're happy to accommodate your dietary preferences. Our vegetarian options are carefully crafted to ensure a delicious and satisfying meal. Would you like me to recommend some dishes that fit your needs?", 'Our vegetarian options are available on the menu, and our chef can modify any dish to suit your dietary needs.', "I understand your frustration with the recent tournament results, and I'll review the standings to see what we can do to improve your experience.", "I'll do my best to find a suitable replacement for the item you're looking for, but I need to know more about what you're looking for."]
['I appreciate your interest in our vegetarian options. I can provide you with a list of our current dishes that cater to your dietary preferences.', "I understand you're concerned about the ski lessons, and I'll look into the options for rescheduling.", 'Our technica

## **Cleaning the text corpus**
##### Now we need to process the unclean text corpus, by performing actions such as:
- ##### Removing punctuation;
- ##### Lower case folding;
- ##### Stemming (using PorterStemmer);
- ##### Removing Stop Words (optional);
##### For that purpose we will import Python's [regular expression](https://docs.python.org/3/library/re.html) library (`re`) and [nltk](https://www.nltk.org/api/nltk.html).

In [3]:
import nltk
# Download the NLTK data packages 'wordnet' and 'omw-1.4'; when they are
# already installed this only prints an "already up-to-date" message.
nltk.download('wordnet')
nltk.download('omw-1.4') 
# WordNet corpus reader, used further down for synonym lookups (`freq_syn`).
from nltk.corpus import wordnet


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/magicojayz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/magicojayz/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
import nltk
import re
from nltk.corpus import stopwords, wordnet
from nltk.stem.porter import PorterStemmer

# Download the NLTK 'stopwords' data package that stopwords.words() reads from.
nltk.download('stopwords')

# English stop words stored as a set, used for membership tests while cleaning.
stop_words = set(stopwords.words('english'))
# Porter stemmer instance (the synonym-based cleaning in this cell does not call it).
ps = PorterStemmer()

# Memo table for freq_syn: maps an input word to the replacement it resolved to.
synonym_cache = {}

def freq_syn(word):
    """Return the highest-count WordNet lemma found across all synsets of ``word``.

    Every lemma of every synset returned by ``wordnet.synsets(word)`` is a
    candidate, scored by ``lemma.count()``. The winner has underscores turned
    into spaces and is lowercased, so the result may be a multi-word phrase.
    When several candidates share the top count, the one encountered first
    wins. If WordNet yields no synsets (or no lemmas) the word is returned
    unchanged.

    Results are memoised in the module-level ``synonym_cache`` dict, so each
    distinct word triggers at most one WordNet lookup.

    Args:
        word: The token to look up.

    Returns:
        The replacement string for ``word`` (possibly ``word`` itself).
    """
    if word not in synonym_cache:
        # Flatten (name, count) pairs in synset order, then lemma order.
        candidates = [
            (lemma.name(), lemma.count())
            for synset in wordnet.synsets(word)
            for lemma in synset.lemmas()
        ]
        if candidates:
            # max() keeps the first maximal element, i.e. the same pick as a
            # stable descending sort followed by taking index 0.
            best_name, _ = max(candidates, key=lambda pair: pair[1])
            synonym_cache[word] = best_name.replace('_', ' ').lower()
        else:
            synonym_cache[word] = word
    return synonym_cache[word]

def clean_with_synonyms(texts):
    """Clean each document and replace every kept token via ``freq_syn``.

    Per document: every character that is not an ASCII letter becomes a space,
    the text is lowercased and split on whitespace, tokens found in
    ``stop_words`` are dropped, and each remaining token is replaced by
    ``freq_syn(token)``. The replacements are joined with single spaces; a
    replacement may itself contain spaces.

    Args:
        texts: Iterable of raw document strings.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    cleaned_docs = []
    for raw_text in texts:
        tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
        cleaned_docs.append(
            ' '.join(freq_syn(token) for token in tokens if token not in stop_words)
        )
    return cleaned_docs


# The train and test splits go through exactly the same pipeline; this used to
# be two copy-pasted loops.
clean_corpus_syn = clean_with_synonyms(unclean_corpus)
print(clean_corpus_syn[:5])

clean_corpus_test_syn = clean_with_synonyms(unclean_corpus_test)
print(clean_corpus_test_syn[0:5])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/magicojayz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['flight reschedule tomorrow please check airport web site update change', 'happy fit dietary taste vegetarian alternative carefully craft see delightful meet meal would like recommend serve meet require', 'vegetarian alternative available fare chef change serve case dietary require', 'see frustration recent tournament effect review stand see improve have', 'good find suited substitute item seem require know seem']
['appreciate interest vegetarian alternative provide list current serve provide dietary taste', 'see concern ski lesson seem alternative reschedule', 'technical skill of course cross requirement matter data analysis include data visualization statistical pattern of course material available learn platform', 'buffet hour prime minister please note limited selection alternative available lunch disclose', 'seem policy detail see alternative available']


In [None]:
ps = PorterStemmer()
# `sw` stays a list so anything that already consumes it keeps working.
sw = stopwords.words('english')
# Hashed copy for the per-token membership test; scanning the list for every
# token of every document was needlessly slow.
sw_lookup = frozenset(sw)


def clean_with_stemming(texts, stemmer=ps, stop_set=sw_lookup):
    """Clean each document and Porter-stem every kept token.

    Per document: every character that is not an ASCII letter becomes a space,
    the text is lowercased and split on whitespace, tokens found in
    ``stop_set`` are dropped, and each remaining token is replaced by
    ``stemmer.stem(token)``. The stems are joined with single spaces.

    Args:
        texts: Iterable of raw document strings.
        stemmer: Object exposing ``stem(word)``; defaults to the cell's ``ps``.
        stop_set: Container of tokens to drop; defaults to the English stop words.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    cleaned_docs = []
    for raw_text in texts:
        tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
        cleaned_docs.append(
            ' '.join(stemmer.stem(token) for token in tokens if token not in stop_set)
        )
    return cleaned_docs


# The train and test splits go through exactly the same pipeline; this used to
# be two copy-pasted loops.
clean_corpus = clean_with_stemming(unclean_corpus)
print(clean_corpus[0:5])

clean_corpus_test = clean_with_stemming(unclean_corpus_test)
print(clean_corpus_test[0:5])

['flight reschedul tomorrow pleas check airport websit updat chang', 'happi accommod dietari prefer vegetarian option care craft ensur delici satisfi meal would like recommend dish fit need', 'vegetarian option avail menu chef modifi dish suit dietari need', 'understand frustrat recent tournament result review stand see improv experi', 'best find suitabl replac item look need know look']
['appreci interest vegetarian option provid list current dish cater dietari prefer', 'understand concern ski lesson look option reschedul', 'technic skill cours cover essenti topic data analysi includ data visual statist model cours materi avail learn platform', 'buffet hour pm pleas note limit select option avail lunch break', 'look polici detail see option avail']
