# Import Dataset

In [9]:
import gc
gc.collect()
import subprocess as sp
import os
def get_gpu_memory():
    command = "nvidia-smi --query-gpu=memory.free --format=csv"
    memory_free_info = sp.check_output(command.split()).decode('ascii').split('\n')[:-1][1:]
    memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
    return memory_free_values

get_gpu_memory()

[11170]

In [10]:
import json
from nltk.corpus import stopwords
import re
import json
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
with open('test_dataset.json', 'r') as fp:
    test_dataset = json.load(fp)
with open('train_dataset.json', 'r') as fp:
    train_dataset = json.load(fp)

In [11]:
f = open('/data/data_codebook.json')
data_codebook = json.load(f)
super_set={}
for s in data_codebook:
    if s[2]!="domain_name":
        if s[2] not in super_set:
            super_set[s[2]]=[]
        if s[5] not in super_set[s[2]]:
            super_set[s[2]].append(s[5])

In [12]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, roc_auc_score, classification_report, confusion_matrix
sw = stopwords.words('english')

def clean_text(text):
    
    text = text.lower()
    
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    text = re.sub(r"http\S+", "",text) #Removing URLs 
    #text = re.sub(r"http", "",text)
    
    html=re.compile(r'<.*?>') 
    
    text = html.sub(r'',text) #Removing html tags
    
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') #Removing punctuations
        
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    
    text = " ".join(text) #removing stopwords
    
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) #Removing emojis
    
    return text

In [23]:
def extract_welfare_categories(dataset):
    welfare_categories = {}
    for key, value in super_set.items():
        if key != 'Welfare and Quality of Life':
            continue
        for category in value:
            welfare_categories[category] =dataset[category]
    
    return welfare_categories

welfare_categories_train=extract_welfare_categories(train_dataset)
welfare_categories_test=extract_welfare_categories(test_dataset)

In [24]:
def calculate_dataset_counts(dataset):
    count_dataset = {
        "general": {},
        "detailed": {}
    }
    
    for s in dataset:
        detailed_label = s['detailed_label']
        general_label = s["general_label"]
        
        if detailed_label not in count_dataset["detailed"]:
            count_dataset["detailed"][detailed_label] = 0
        count_dataset["detailed"][detailed_label] += 1

        if general_label not in count_dataset["general"]:
            count_dataset["general"][general_label] = 0
        count_dataset["general"][general_label] += 1
    
    return count_dataset

In [25]:
def create_custom_dataset(balanced_dataset, total_limit, desired_percentage):
    used_sentences = set()
    dataset = []
    welfare_count = 0
    welfare_limit = int(total_limit * desired_percentage)
    category_limits = {category: 100 for category in balanced_dataset.keys()}

    # Calculate the total number of sentences in the dataset
    total_sentences = sum(len(v) for v in balanced_dataset.values())

    # First, add sentences from 'Environmental Protection'
    for sentence in balanced_dataset['Environmental Protection']:
        if len(dataset) >= welfare_limit:
            break

        if sentence not in used_sentences:
            per_line_dict = {
                "sentence": sentence,
                "detailed_label": 'Environmental Protection',
                "general_label": 'Welfare and Quality of Life'
            }
            dataset.append(per_line_dict)
            used_sentences.add(sentence)
            welfare_count += 1

    # Then, add sentences from other categories
    while len(dataset) < total_limit:
        for category, sentences in balanced_dataset.items():
            if len(dataset) >= total_limit:
                break
            if category != 'Environmental Protection':
                for key, value in super_set.items():
                    if category in value:
                        super_label = key

                category_limit = min(len(sentences), category_limits[category])

                limit = min(100, category_limit)

                for sentence in sentences:
                    if limit == 0 or len(dataset) >= total_limit:
                        break

                    if sentence not in used_sentences:
                        per_line_dict = {
                            "sentence": sentence,
                            "detailed_label": category,
                            "general_label": super_label
                        }
                        dataset.append(per_line_dict)
                        used_sentences.add(sentence)
                        limit -= 1
                        category_limits[category] += 100

    return dataset, welfare_count


In [57]:
"Experiment 1"
total_limit = 56000
desired_percentage = 0.4

dataset, welfare_count = create_custom_dataset(welfare_categories_train, total_limit, desired_percentage)

print("Total sentences in the dataset:", len(dataset))
print("Sentences with 'Welfare and Quality of Life' label:", welfare_count)

# Usage example:
# Assuming you have the 'dataset' variable containing the dataset obtained from the create_custom_dataset function
# Replace this with the actual dataset you want to count.
count_dataset = calculate_dataset_counts(dataset)
print(count_dataset["general"])
print(sum(count_dataset["general"].values()))
print(len(count_dataset["detailed"].keys()))
print(count_dataset["detailed"])

Total sentences in the dataset: 56000
Sentences with 'Welfare and Quality of Life' label: 22400
{'Welfare and Quality of Life': 56000}
56000
11
{'Environmental Protection': 22400, 'Culture: Positive': 6900, 'Equality: Positive': 6894, 'Welfare State Expansion': 6800, 'Welfare State Limitation': 5380, 'Education Expansion': 6800, 'Education Limitation': 515, 'Private-Public Mix in Culture: Positive': 36, 'Private-Public Mix in Social Justice: Positive': 1, 'Private-Public Mix in Welfare: Positive': 227, 'Private-Public Mix in Education: Positive': 47}


In [52]:
def text_classification(dataframe, test_data):
    dataframe = pd.DataFrame(data=(dataframe))
    dataframe=shuffle(dataframe).dropna()
    train_sentences=list(dataframe["sentence"])
    train_labels=[]
    for s in list(dataframe["detailed_label"]):
        if s=='Environmental Protection':
            number=1
        else:
            number=0
        train_labels.append(number)
    len(train_sentences)==len(train_labels)

    train_sentences_cleaned=[]
    for s in list(dataframe["sentence"]):
        cleaned=clean_text(s)
        train_sentences_cleaned.append(cleaned)


    X_train, X_test, y_train, y_test = train_test_split(train_sentences_cleaned, train_labels, test_size=0.3, random_state=42)

    # Vectorize the text data using TF-IDF
    vectorizer = TfidfVectorizer()
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)  # Use transform, not fit_transform

    # Train the models
    logreg_model = LogisticRegression(max_iter=1000)
    gnb_model = GaussianNB()
    svc_model = LinearSVC()

    logreg_model.fit(X_train_vectorized, y_train)
    gnb_model.fit(X_train_vectorized.toarray(), y_train)
    svc_model.fit(X_train_vectorized, y_train)

    # Predict on the test set
    logreg_pred = logreg_model.predict(X_test_vectorized)
    gnb_pred = gnb_model.predict(X_test_vectorized.toarray())
    svc_pred = svc_model.predict(X_test_vectorized)

    # Evaluate accuracy
    logreg_accuracy = accuracy_score(y_test, logreg_pred)
    gnb_accuracy = accuracy_score(y_test, gnb_pred)
    svc_accuracy = accuracy_score(y_test, svc_pred)

    # Evaluate F1-score
    logreg_f1_score = f1_score(y_test, logreg_pred, average='weighted')
    gnb_f1_score = f1_score(y_test, gnb_pred, average='weighted')
    svc_f1_score = f1_score(y_test, svc_pred, average='weighted')

    # Evaluate balanced accuracy
    logreg_balanced_accuracy = balanced_accuracy_score(y_test, logreg_pred)
    gnb_balanced_accuracy = balanced_accuracy_score(y_test, gnb_pred)
    svc_balanced_accuracy = balanced_accuracy_score(y_test, svc_pred)

    # Evaluate ROC-AUC score
    logreg_roc_auc_score = roc_auc_score(y_test, logreg_model.predict_proba(X_test_vectorized)[:, 1], multi_class='ovr')
    gnb_roc_auc_score = roc_auc_score(y_test, gnb_model.predict_proba(X_test_vectorized.toarray())[:, 1], multi_class='ovr')
    svc_roc_auc_score = roc_auc_score(y_test, svc_model.decision_function(X_test_vectorized), multi_class='ovr')

    # Print evaluation metrics for Logistic Regression
    print("Logistic Regression Accuracy:", logreg_accuracy)
    print("Logistic Regression F1-Score:", logreg_f1_score)
    print("Logistic Regression Balanced Accuracy:", logreg_balanced_accuracy)
    print("Logistic Regression ROC-AUC Score:", logreg_roc_auc_score)
    print()

    # Print evaluation metrics for Gaussian Naive Bayes
    print("Gaussian Naive Bayes Accuracy:", gnb_accuracy)
    print("Gaussian Naive Bayes F1-Score:", gnb_f1_score)
    print("Gaussian Naive Bayes Balanced Accuracy:", gnb_balanced_accuracy)
    print("Gaussian Naive Bayes ROC-AUC Score:", gnb_roc_auc_score)
    print()

    # Print evaluation metrics for Linear SVC
    print("Linear SVC Accuracy:", svc_accuracy)
    print("Linear SVC F1-Score:", svc_f1_score)
    print("Linear SVC Balanced Accuracy:", svc_balanced_accuracy)
    print("Linear SVC ROC-AUC Score:", svc_roc_auc_score)
    print()

    # Print classification report and confusion matrix for Linear SVC
    print("Linear SVC Classification Report:")
    print(classification_report(y_test, svc_pred))

    print("Linear SVC Confusion Matrix:")
    print(confusion_matrix(y_test, svc_pred))

    print("-----Now test the model on another unseen test sets--------")
    test_dataframe=[]
    for k,v in test_data.items():
        for key, value in super_set.items():
            if k in value:
                super_label = key
        for s in v:
                per_line_dict = {}
                per_line_dict["sentence"] = s
                per_line_dict["detailed_label"] = k
                per_line_dict["general_label"] = super_label
                test_dataframe.append(per_line_dict)
                
    test_dataframe = pd.DataFrame(data=(test_dataframe))
    test_dataframe=shuffle(test_dataframe).dropna()
    test_sentences_cleaned=[]
    for s in list(test_dataframe["sentence"]):
        cleaned=clean_text(s)
        test_sentences_cleaned.append(cleaned)
        
    y_test=[]
    for s in list(test_dataframe["detailed_label"]):
        if s=='Environmental Protection':
            number=1
        else:
            number=0
        y_test.append(number)


    # Vectorize the text data using TF-IDF
    X_vectorized = vectorizer.transform(test_sentences_cleaned)
    svc_pred = svc_model.predict(X_vectorized)

    # Evaluate accuracy
    svc_accuracy = accuracy_score(y_test, svc_pred)

    # Evaluate F1-score
    svc_f1_score = f1_score(y_test, svc_pred, average='weighted')

    # Evaluate balanced accuracy
    svc_balanced_accuracy = balanced_accuracy_score(y_test, svc_pred)

    # Evaluate ROC-AUC score
    svc_roc_auc_score = roc_auc_score(y_test, svc_model.decision_function(X_vectorized), multi_class='ovr')

    # Print evaluation metrics for Linear SVC on the unseen test set
    print("Linear SVC Accuracy on Unseen Test Set:", round(svc_accuracy,2))
    print("Linear SVC F1-Score on Unseen Test Set:", round(svc_f1_score,2))
    print("Linear SVC Balanced Accuracy on Unseen Test Set:", round(svc_balanced_accuracy,2))
    print("Linear SVC ROC-AUC Score on Unseen Test Set:", round(svc_roc_auc_score,2))
    print()

    # Print classification report and confusion matrix for Linear SVC on the unseen test set
    print("Linear SVC Classification Report on Unseen Test Set:")
    print(classification_report(y_test, svc_pred))

    print("Linear SVC Confusion Matrix on Unseen Test Set:")
    print(confusion_matrix(y_test, svc_pred))

In [53]:
text_classification(dataset,welfare_categories_test)

Logistic Regression Accuracy: 0.9501190476190476
Logistic Regression F1-Score: 0.9445570925210562
Logistic Regression Balanced Accuracy: 0.773473679913486
Logistic Regression ROC-AUC Score: 0.9648350141308188

Gaussian Naive Bayes Accuracy: 0.6739880952380952
Gaussian Naive Bayes F1-Score: 0.7391939510287234
Gaussian Naive Bayes Balanced Accuracy: 0.70426949419908
Gaussian Naive Bayes ROC-AUC Score: 0.70426949419908

Linear SVC Accuracy: 0.9592261904761905
Linear SVC F1-Score: 0.9572056571399592
Linear SVC Balanced Accuracy: 0.8421455198043682
Linear SVC ROC-AUC Score: 0.961747305508553

Linear SVC Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     15129
           1       0.87      0.70      0.77      1671

    accuracy                           0.96     16800
   macro avg       0.92      0.84      0.88     16800
weighted avg       0.96      0.96      0.96     16800

Linear SVC Confusion Matrix:
[[14952   177]
