# Import Dataset

In [817]:
import gc
gc.collect()
import subprocess as sp
import os
def get_gpu_memory():
    """Return the free-memory figure that ``nvidia-smi`` reports for each GPU.

    Runs ``nvidia-smi --query-gpu=memory.free --format=csv`` and parses the
    leading integer of every data row.

    Returns:
        list[int]: one value per output row, in the order printed
        (presumably MiB, as printed by nvidia-smi -- confirm).

    Raises:
        Whatever ``subprocess.check_output`` raises when the command is missing
        or exits with a non-zero status.
    """
    query = "nvidia-smi --query-gpu=memory.free --format=csv"
    raw_output = sp.check_output(query.split()).decode('ascii')
    # Drop the first line (CSV header) and the last element (normally the empty
    # string left behind by the trailing newline).
    rows = raw_output.split('\n')[1:-1]
    return [int(row.split()[0]) for row in rows]

# Display the free GPU memory as the cell output (sanity check before training).
get_gpu_memory()

[11176]

In [818]:
import json
from nltk.corpus import stopwords
import re
import json
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
# Load the pre-built test and train splits from the working directory.
with open('test_dataset.json', 'r') as fp:
    test_dataset = json.load(fp)
with open('train_dataset.json', 'r') as fp:
    train_dataset = json.load(fp)

# Read the codebook inside a context manager so the file handle is closed
# as soon as parsing finishes (it was previously left open).
with open('/data/data_codebook.json') as f:
    data_codebook = json.load(f)

# super_set maps each general label (codebook column 2) to the ordered list of
# distinct detailed labels (column 5) that belong to it. Rows whose column 2 is
# "domain_name" are skipped -- presumably the header row.
super_set = {}
for s in data_codebook:
    if s[2] != "domain_name":
        if s[5] not in super_set.setdefault(s[2], []):
            super_set[s[2]].append(s[5])

In [819]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, roc_auc_score, classification_report, confusion_matrix
# English stop-word list used by clean_text() to filter tokens.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be downloaded beforehand -- confirm.
sw = stopwords.words('english')

def clean_text(text, stop_words=None):
    """Normalise a sentence for bag-of-words / TF-IDF features.

    Steps, in order:
      1. lower-case the text;
      2. strip HTML tags and ``http...`` URLs;
      3. replace every run of characters other than ASCII letters and
         ``? . ! , ¿`` with a single space (this also removes digits, emoji
         and any other symbols);
      4. delete the remaining punctuation listed in ``punctuations``;
      5. drop stop words and collapse whitespace.

    Tags and URLs must be removed *before* step 3: that step erases ``<``,
    ``>``, ``:`` and ``/``, after which neither pattern can match and tag
    names / URL fragments would leak into the output as ordinary words.

    Args:
        text (str): raw sentence.
        stop_words (iterable of str, optional): lower-case words to drop.
            Defaults to the module-level ``sw`` list.

    Returns:
        str: the cleaned tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = sw  # module-level English stop-word list
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)  # O(1) membership tests per token

    text = text.lower()

    # Markup first, then links, each replaced by a space so that neighbouring
    # words are not glued together.
    text = re.sub(r'<.*?>', ' ', text)    # Removing html tags
    text = re.sub(r"http\S+", " ", text)  # Removing URLs

    # Replacing everything with space except (a-z, A-Z, ".", "?", "!", ",", "¿").
    # Emoji are outside this whitelist, so no separate emoji pass is needed.
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)

    # NOTE(review): "," and "¿" are whitelisted above but absent from this list,
    # so they remain attached to tokens in the output -- confirm this is intended.
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p, '')  # Removing punctuations

    # Removing stopwords; split()/join also collapses repeated whitespace.
    return " ".join(word for word in text.split() if word not in stop_words)

In [820]:
def calculate_dataset_counts(dataset):
    """Tally how many records carry each general and each detailed label.

    Args:
        dataset: iterable of dicts, each with a 'detailed_label' and a
            'general_label' key.

    Returns:
        dict: ``{"general": {label: count}, "detailed": {label: count}}``,
        with labels in order of first appearance.
    """
    general_totals = {}
    detailed_totals = {}

    for record in dataset:
        # Read both labels before updating either tally.
        fine, coarse = record['detailed_label'], record["general_label"]
        detailed_totals[fine] = detailed_totals.get(fine, 0) + 1
        general_totals[coarse] = general_totals.get(coarse, 0) + 1

    return {"general": general_totals, "detailed": detailed_totals}

In [821]:
def create_custom_dataset(balanced_dataset, total_limit, desired_percentage,
                          category_map=None,
                          target_super_category='Welfare and Quality of Life'):
    """Assemble a labelled sample in which one super-category fills a fixed share.

    The sample is built in four steps:
      1. Target quota: ``int(total_limit * desired_percentage)`` sentences are
         drawn from the categories of ``target_super_category``, each category
         receiving a limit proportional to its size within that super-category.
      2. Target top-up: if integer truncation or duplicates left the target
         quota short, the gap is filled from the target category with the
         largest limit.
      3. Remaining quota: ``total_limit - target quota`` is split over all other
         categories, proportionally to their size among non-target sentences.
      4. Final top-up: any gap up to ``total_limit`` is filled from the
         non-target category with the largest limit.

    Sentences are taken in their stored order (no shuffling) and a sentence is
    never used twice, even if it appears under several categories. If the input
    does not hold enough unused sentences the result is shorter than
    ``total_limit``. Phases that have no categories to draw from are skipped
    instead of raising.

    Args:
        balanced_dataset (dict): ``{detailed category: [sentence, ...]}``.
        total_limit (int): desired size of the sample.
        desired_percentage (float): share of ``total_limit`` reserved for the
            target super-category (e.g. ``0.10``).
        category_map (dict, optional): ``{super-category: [detailed category,
            ...]}``. Defaults to the notebook-level ``super_set``.
        target_super_category (str, optional): key of ``category_map`` whose
            share is controlled by ``desired_percentage``.

    Returns:
        list[dict]: records with keys "sentence", "detailed_label" and
        "general_label".

    Raises:
        KeyError: if ``target_super_category`` is not a key of ``category_map``.
    """
    if category_map is None:
        category_map = super_set  # notebook-level {super-category: [categories]}

    dataset = []
    used_sentences = set()
    super_category_counts = {name: 0 for name in category_map}

    def add_record(sentence, category, super_category):
        """Append one labelled record and update the bookkeeping."""
        dataset.append({
            "sentence": sentence,
            "detailed_label": category,
            "general_label": super_category,
        })
        used_sentences.add(sentence)
        super_category_counts[super_category] += 1

    def proportional_limits(categories, quota, pool_size):
        """Split ``quota`` over the present categories by their share of ``pool_size``."""
        limits = {}
        for category in categories:
            if category not in balanced_dataset:
                continue
            # An empty pool gets a zero share instead of dividing by zero.
            share = len(balanced_dataset[category]) / pool_size if pool_size else 0
            limits[category] = int(quota * share)
        return limits

    def top_up(category, super_category, shortfall):
        """Add up to ``shortfall`` unused sentences from ``category``."""
        for sentence in balanced_dataset[category]:
            if shortfall <= 0:
                break
            if sentence not in used_sentences:
                add_record(sentence, category, super_category)
                shortfall -= 1

    # ---- Step 1: target super-category, proportional per-category limits ----
    target_categories = category_map[target_super_category]
    target_quota = int(total_limit * desired_percentage)
    # Never let the target share exceed the overall size of the sample.
    target_cap = min(target_quota, total_limit)
    target_pool = sum(len(balanced_dataset[category])
                      for category in target_categories
                      if category in balanced_dataset)
    target_limits = proportional_limits(target_categories, target_quota, target_pool)

    for category in target_categories:
        if category not in balanced_dataset:
            continue
        remaining = target_limits[category]
        for sentence in balanced_dataset[category]:
            if remaining <= 0 or len(dataset) >= target_cap:
                break
            if sentence not in used_sentences:
                add_record(sentence, category, target_super_category)
                remaining -= 1

    # ---- Step 2: top up the target share from its largest category ----
    if target_limits and len(dataset) < target_cap:
        largest_target = max(target_limits, key=target_limits.get)
        top_up(largest_target, target_super_category, target_cap - len(dataset))

    # ---- Step 3: every other super-category, proportional limits ----
    other_quota = total_limit - target_quota
    total_sentences = sum(len(sentences) for sentences in balanced_dataset.values())
    other_pool = total_sentences - target_pool

    other_limits = {}
    for super_category, categories in category_map.items():
        if super_category == target_super_category:
            continue
        other_limits.update(proportional_limits(categories, other_quota, other_pool))

    for super_category, categories in category_map.items():
        if super_category == target_super_category:
            continue
        for category in categories:
            if category not in balanced_dataset:
                continue
            remaining = other_limits[category]
            for sentence in balanced_dataset[category]:
                # Both conditions are permanent once true, so stop scanning.
                if remaining <= 0 or super_category_counts[super_category] >= other_quota:
                    break
                if sentence not in used_sentences:
                    add_record(sentence, category, super_category)
                    remaining -= 1

    # ---- Step 4: top up to total_limit from the largest non-target category ----
    if other_limits and len(dataset) < total_limit:
        largest_other = max(other_limits, key=other_limits.get)
        owner = None
        for super_category, categories in category_map.items():
            if largest_other in categories:
                owner = super_category  # last match wins
        if owner is not None:
            top_up(largest_other, owner, total_limit - len(dataset))

    return dataset

In [844]:
# Experiment 1: build a 100,000-sentence training sample in which 10% of the
# sentences are requested from the 'Welfare and Quality of Life' super-category.
"Experiment 1"
total_limit = 100000
desired_percentage = 0.10
dataset = create_custom_dataset(train_dataset, total_limit, desired_percentage)

# Sanity check: the sample should reach total_limit if enough data is available.
print("Total sentences in the dataset:", len(dataset))

# Per-label breakdown: first the general labels, then the detailed labels
# grouped under the general label they belong to in super_set.
count_dataset = calculate_dataset_counts(dataset)
print(count_dataset["general"])
print("Detailed categories inside general category:")
for k,v in super_set.items():
    print("---------------------------")
    print("General label:",k)
    for c in v:
        # Only detailed labels that actually occur in the sample are listed.
        if c in count_dataset["detailed"]:
            print(c,":",count_dataset["detailed"][c])

Total sentences in the dataset: 100000
{'Welfare and Quality of Life': 10000, 'External Relations': 10507, 'Freedom and Democracy': 8570, 'Political System': 13519, 'Economy': 34878, 'Fabric of Society': 15369, 'Social Groups': 7157}
Detailed categories inside general category:
---------------------------
General label: NA
---------------------------
General label: External Relations
Foreign Special Relationships: Positive : 385
Foreign Special Relationships: Negative : 136
Anti-Imperialism : 62
Military: Positive : 2160
Military: Negative : 518
Peace : 531
Internationalism: Positive : 3567
European Community/Union: Positive : 2164
Internationalism: Negative : 260
European Community/Union: Negative : 595
Russia/USSR/CIS: Positive : 3
Western States: Positive : 3
SFR Yugoslavia: Positive : 7
Independence: Positive : 5
Anti-Imperialism: Foreign Financial Influence : 111
---------------------------
General label: Freedom and Democracy
Freedom and Human Rights : 1850
Democracy : 1468
Const

In [845]:
def text_classification(dataframe, test_data, random_state=42,
                        positive_label='Welfare and Quality of Life'):
    """Train binary TF-IDF classifiers and print their evaluation metrics.

    The task is one-vs-rest: sentences whose general label equals
    ``positive_label`` are class 1, everything else is class 0.

    Workflow:
      1. Shuffle the training records, clean the sentences with ``clean_text``
         and hold out 30% as a validation split.
      2. Fit Logistic Regression, Gaussian Naive Bayes and Linear SVC on
         TF-IDF features of the remaining 70%.
      3. Print accuracy, weighted F1, balanced accuracy and ROC-AUC for each
         model, plus the classification report and confusion matrix of the
         Linear SVC.
      4. Evaluate the Linear SVC on ``test_data`` and print the same metrics
         (rounded to two decimals), report and confusion matrix.

    Args:
        dataframe: list of records (or anything ``pd.DataFrame`` accepts) with
            "sentence" and "general_label" columns.
        test_data (dict): ``{detailed category: [sentence, ...]}``; each
            category is mapped to its general label through the notebook-level
            ``super_set``. Categories missing from ``super_set`` are skipped
            and reported, rather than inheriting the label of whichever
            category happened to be processed before them.
        random_state (int, optional): seed for both shuffles, the train/
            validation split and the Linear SVC, so repeated runs print the
            same numbers.
        positive_label (str, optional): general label treated as class 1.

    Returns:
        None. All results are printed.
    """

    def binarize(labels):
        """Map general labels to 1 (positive_label) or 0 (anything else)."""
        return [1 if label == positive_label else 0 for label in labels]

    def print_scores(model_name, y_true, y_pred, y_score, suffix="", ndigits=None):
        """Print the four headline metrics for one model, then a blank line."""
        scores = [
            ("Accuracy", accuracy_score(y_true, y_pred)),
            ("F1-Score", f1_score(y_true, y_pred, average='weighted')),
            ("Balanced Accuracy", balanced_accuracy_score(y_true, y_pred)),
            # Binary target with a 1-D score vector: no multi_class option needed.
            ("ROC-AUC Score", roc_auc_score(y_true, y_score)),
        ]
        for metric_name, value in scores:
            if ndigits is not None:
                value = round(value, ndigits)
            print(f"{model_name} {metric_name}{suffix}:", value)
        print()

    def print_svc_report(y_true, y_pred, suffix=""):
        """Print the classification report and confusion matrix of the Linear SVC."""
        print(f"Linear SVC Classification Report{suffix}:")
        print(classification_report(y_true, y_pred))

        print(f"Linear SVC Confusion Matrix{suffix}:")
        print(confusion_matrix(y_true, y_pred))

    # ---- Prepare the training data ----
    # Seeded shuffle: without it the fixed-seed split below would still see a
    # different row order on every run.
    train_frame = shuffle(pd.DataFrame(data=dataframe), random_state=random_state).dropna()
    train_labels = binarize(train_frame['general_label'])
    train_sentences_cleaned = [clean_text(s) for s in train_frame["sentence"]]

    X_train, X_test, y_train, y_test = train_test_split(
        train_sentences_cleaned, train_labels, test_size=0.3, random_state=random_state)

    # Vectorize the text data using TF-IDF (fit on the training split only)
    vectorizer = TfidfVectorizer()
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)  # Use transform, not fit_transform

    # ---- Train the models ----
    logreg_model = LogisticRegression(max_iter=1000)
    gnb_model = GaussianNB()
    svc_model = LinearSVC(random_state=random_state)

    logreg_model.fit(X_train_vectorized, y_train)
    # GaussianNB needs dense input; this materialises the full TF-IDF matrix in
    # memory, which can be very large for big vocabularies.
    gnb_model.fit(X_train_vectorized.toarray(), y_train)
    svc_model.fit(X_train_vectorized, y_train)

    # ---- Evaluate on the held-out validation split ----
    X_test_dense = X_test_vectorized.toarray()
    svc_pred = svc_model.predict(X_test_vectorized)

    print_scores("Logistic Regression", y_test,
                 logreg_model.predict(X_test_vectorized),
                 logreg_model.predict_proba(X_test_vectorized)[:, 1])
    print_scores("Gaussian Naive Bayes", y_test,
                 gnb_model.predict(X_test_dense),
                 gnb_model.predict_proba(X_test_dense)[:, 1])
    print_scores("Linear SVC", y_test, svc_pred,
                 svc_model.decision_function(X_test_vectorized))

    print_svc_report(y_test, svc_pred)

    print("-----Now test the model on another unseen test sets--------")

    # ---- Build the unseen test set ----
    # Reverse lookup detailed category -> general label (last match wins).
    category_to_super = {
        category: super_category
        for super_category, categories in super_set.items()
        for category in categories
    }
    test_records = []
    unmapped_categories = []
    for category, sentences in test_data.items():
        super_label = category_to_super.get(category)
        if super_label is None:
            unmapped_categories.append(category)
            continue
        for s in sentences:
            test_records.append({
                "sentence": s,
                "detailed_label": category,
                "general_label": super_label,
            })
    if unmapped_categories:
        print("Skipped test categories missing from super_set:", unmapped_categories)

    test_frame = shuffle(pd.DataFrame(data=test_records), random_state=random_state).dropna()
    test_sentences_cleaned = [clean_text(s) for s in test_frame["sentence"]]
    y_unseen = binarize(test_frame['general_label'])

    # Vectorize with the vocabulary learned on the training split
    X_unseen = vectorizer.transform(test_sentences_cleaned)
    unseen_pred = svc_model.predict(X_unseen)

    # ---- Evaluate the Linear SVC on the unseen test set ----
    print_scores("Linear SVC", y_unseen, unseen_pred,
                 svc_model.decision_function(X_unseen),
                 suffix=" on Unseen Test Set", ndigits=2)

    print_svc_report(y_unseen, unseen_pred, suffix=" on Unseen Test Set")

In [846]:
# Train on the Experiment 1 sample and report metrics on both the held-out split and the separate test set.
text_classification(dataset,test_dataset)

Logistic Regression Accuracy: 0.9059
Logistic Regression F1-Score: 0.8775483904292336
Logistic Regression Balanced Accuracy: 0.5603305378535451
Logistic Regression ROC-AUC Score: 0.8295013984873945

Gaussian Naive Bayes Accuracy: 0.4623333333333333
Gaussian Naive Bayes F1-Score: 0.5553252495308383
Gaussian Naive Bayes Balanced Accuracy: 0.5736816046493882
Gaussian Naive Bayes ROC-AUC Score: 0.5736816046493882

Linear SVC Accuracy: 0.9039666666666667
Linear SVC F1-Score: 0.8833842684687337
Linear SVC Balanced Accuracy: 0.5885425384675644
Linear SVC ROC-AUC Score: 0.802865525156919

Linear SVC Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     27049
           1       0.53      0.20      0.29      2951

    accuracy                           0.90     30000
   macro avg       0.73      0.59      0.62     30000
weighted avg       0.88      0.90      0.88     30000

Linear SVC Confusion Matrix:
[[26541   508]
 [ 2373