In [1]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns 
from matplotlib import pyplot as plt
import matplotlib as mpl
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# change this to your own data directory
data_dir = "data/"

# read and preprocess data
text_file_name = "osdg-community-data-v2023-01-01.csv"
text_df = pd.read_csv(data_dir + text_file_name,sep = "\t",  quotechar='"')
col_names = text_df.columns.values[0].split('\t')
text_df[col_names] = text_df[text_df.columns.values[0]].apply(lambda x: pd.Series(str(x).split("\t")))
text_df = text_df.astype({'sdg':int, 'labels_negative': int, 'labels_positive':int, 'agreement': float}, copy=True)
text_df.drop(text_df.columns.values[0], axis=1, inplace=True)
text_df = text_df.query("agreement > 0.5 and (labels_positive - labels_negative) > 2")
text_df.reset_index(inplace=True, drop=True)



In [2]:
docs = text_df.text
categories = text_df.sdg
X_train, X_test, y_train, y_test = \
    train_test_split(docs, categories, test_size=0.33, random_state=7)

# Solutions to Exercises: Sections 14.5 to 14.7

For exercises 4.1 to 4.3, the below code will be used to establish the dataset used for all next classifications.

In [3]:
X_train_tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1), stop_words = "english", min_df=5)
X_train_tfidf_vectorizer.fit(X_train)
X_train_tfidf_vector = X_train_tfidf_vectorizer.transform(X_train)
X_test_tfidf_vector = X_train_tfidf_vectorizer.transform(X_test)

**Exercise 4.1**

In [4]:
tfidf_mlp_clf = MLPClassifier(random_state=1, max_iter=100).fit(X_train_tfidf_vector, y_train)
y_pred = tfidf_mlp_clf.predict(X_test_tfidf_vector)
tfidf_mlp_clf.score(X_test_tfidf_vector, y_test)
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.79      0.81      0.80       481
           2       0.82      0.87      0.84       316
           3       0.94      0.93      0.94       674
           4       0.92      0.94      0.93       863
           5       0.92      0.92      0.92       920
           6       0.90      0.91      0.91       465
           7       0.88      0.88      0.88       730
           8       0.68      0.61      0.64       353
           9       0.79      0.80      0.79       328
          10       0.64      0.58      0.61       256
          11       0.85      0.88      0.86       462
          12       0.85      0.78      0.81       217
          13       0.85      0.87      0.86       443
          14       0.93      0.92      0.93       263
          15       0.89      0.85      0.87       313
          16       0.97      0.97      0.97      1057

    accuracy                           0.88      8141
   macro avg       0.85   

**Exercise 4.2**

In [5]:
tfidf_multinomialNB_clf = MultinomialNB().fit(X_train_tfidf_vector, y_train)
y_pred = tfidf_multinomialNB_clf.predict(X_test_tfidf_vector)
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.70      0.73      0.71       481
           2       0.90      0.67      0.77       316
           3       0.92      0.90      0.91       674
           4       0.76      0.96      0.85       863
           5       0.62      0.94      0.75       920
           6       0.86      0.81      0.84       465
           7       0.60      0.97      0.74       730
           8       0.85      0.08      0.15       353
           9       0.91      0.32      0.48       328
          10       0.89      0.12      0.21       256
          11       0.86      0.75      0.80       462
          12       0.97      0.30      0.46       217
          13       0.84      0.80      0.82       443
          14       0.96      0.61      0.75       263
          15       0.96      0.64      0.77       313
          16       0.86      0.98      0.91      1057

    accuracy                           0.77      8141
   macro avg       0.84   

**Exercise 4.3**

In [6]:
from sklearn.linear_model import RidgeClassifier

tfidf_ridge_clf = RidgeClassifier(tol=1e-2, solver="sparse_cg")
tfidf_ridge_clf = tfidf_ridge_clf.fit(X_train_tfidf_vector, y_train)
y_pred = tfidf_ridge_clf.predict(X_test_tfidf_vector)
print(metrics.classification_report(y_test,y_pred, digits = 4))

              precision    recall  f1-score   support

           1     0.8174    0.8004    0.8088       481
           2     0.8192    0.8892    0.8528       316
           3     0.9294    0.9377    0.9335       674
           4     0.9109    0.9594    0.9345       863
           5     0.9092    0.9467    0.9276       920
           6     0.9188    0.9247    0.9218       465
           7     0.8714    0.9096    0.8901       730
           8     0.7186    0.6006    0.6543       353
           9     0.8233    0.7957    0.8093       328
          10     0.6990    0.5625    0.6234       256
          11     0.8690    0.8615    0.8652       462
          12     0.8836    0.7696    0.8227       217
          13     0.8521    0.8713    0.8616       443
          14     0.9423    0.9316    0.9369       263
          15     0.9112    0.8850    0.8979       313
          16     0.9513    0.9612    0.9562      1057

    accuracy                         0.8840      8141
   macro avg     0.8642   

**Exercise 4.4**

Implementations may vary. One example is given below:

In [7]:
def ClassifyDocs(data, classifier_algorithm, params):
    docs = data.text
    categories = data.sdg
    X_train, X_test, y_train, y_test = train_test_split(docs, categories, test_size=0.33, random_state=7)

    X_train_tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1), stop_words = "english", min_df=5)
    X_train_tfidf_vectorizer.fit(X_train)
    X_train_tfidf_vector = X_train_tfidf_vectorizer.transform(X_train)
    X_test_tfidf_vector = X_train_tfidf_vectorizer.transform(X_test)
    
    if classifier_algorithm == "mlp":
        tfidf_mlp_clf = MLPClassifier(random_state=params[0], max_iter=params[1]).fit(X_train_tfidf_vector, y_train)
        y_pred = tfidf_mlp_clf.predict(X_test_tfidf_vector)
        tfidf_mlp_clf.score(X_test_tfidf_vector, y_test)
        print(metrics.classification_report(y_test,y_pred))

    elif classifier_algorithm == "multinomialNB":
        tfidf_multinomialNB_clf = MultinomialNB().fit(X_train_tfidf_vector, y_train)
        y_pred = tfidf_multinomialNB_clf.predict(X_test_tfidf_vector)
        print(metrics.classification_report(y_test,y_pred))

    elif classifier_algorithm == "ridge":
        tfidf_ridge_clf = RidgeClassifier(tol=params[0], solver=params[1])
        tfidf_ridge_clf = tfidf_ridge_clf.fit(X_train_tfidf_vector, y_train)
        y_pred = tfidf_ridge_clf.predict(X_test_tfidf_vector)
        print(metrics.classification_report(y_test,y_pred, digits = 4))

    else:
        print("Invalid classifier algorithm")

**Exercise 4.5**

In [8]:
print("MLP")
ClassifyDocs(text_df, "mlp", [1,100])
print("MultinomialNB")
ClassifyDocs(text_df, "multinomialNB", [])
print("Ridge")
class_params = [1e-2, "sparse_cg"]
ClassifyDocs(text_df, "ridge", class_params)

MLP
              precision    recall  f1-score   support

           1       0.79      0.81      0.80       481
           2       0.82      0.87      0.84       316
           3       0.94      0.93      0.94       674
           4       0.92      0.94      0.93       863
           5       0.92      0.92      0.92       920
           6       0.90      0.91      0.91       465
           7       0.88      0.88      0.88       730
           8       0.68      0.61      0.64       353
           9       0.79      0.80      0.79       328
          10       0.64      0.58      0.61       256
          11       0.85      0.88      0.86       462
          12       0.85      0.78      0.81       217
          13       0.85      0.87      0.86       443
          14       0.93      0.92      0.93       263
          15       0.89      0.85      0.87       313
          16       0.97      0.97      0.97      1057

    accuracy                           0.88      8141
   macro avg       0.8

**Exercise 5.1**

In [9]:
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer

import tensorflow as tf
import tensorflow_hub as hub

# change this to your own embedding directory
embedding_dir = "embeddings/"

# load the embedding
embed = hub.load(embedding_dir + "universal-sentence-encoder_4")

text_df["embedding"] = list(embed(text_df.text))

file_name = "sdg_names_definitions.csv"
sdg_names = pd.read_csv(data_dir + file_name)

docs = text_df.embedding.tolist()
scaler = preprocessing.MinMaxScaler().fit(docs)
X = scaler.transform(docs)
y = text_df.sdg

label_binarizer = LabelBinarizer().fit(y)
y_onehot = label_binarizer.transform(y)
n_classes = len(label_binarizer.classes_)
class_names = [sdg_names[sdg_names["sdg"] == label_binarizer.classes_[i]].sdg_name.item() \
               for i in range(n_classes)]

X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=.33, random_state=0)
ovr_mlp_clf = OneVsRestClassifier(MLPClassifier(random_state=0, max_iter=300)).fit(X_train,y_train)
y_score = ovr_mlp_clf.predict_proba(X_test)

AttributeError: 'DataFrame' object has no attribute 'embedding'