In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv('/Users/mac/Desktop/MasterClass/PPNCKH/Data/train.csv', names=["document", "class"], header=0)

# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [3]:
train_data_cl = train_data[train_data["class"]=='CL'].iloc[:100].reset_index()
train_data_cr = train_data[train_data["class"]=='CR'].iloc[:100].reset_index()
train_data_dc = train_data[train_data["class"]=='DC'].iloc[:100].reset_index()
train_data_ds = train_data[train_data["class"]=='DS'].iloc[:100].reset_index()
train_data_lo = train_data[train_data["class"]=='LO'].iloc[:100].reset_index()
train_data_ni = train_data[train_data["class"]=='NI'].iloc[:100].reset_index()
train_data_se = train_data[train_data["class"]=='SE'].iloc[:100].reset_index()
train_data = pd.concat([train_data_cl,train_data_cr,train_data_dc,train_data_ds,train_data_lo,train_data_ni,train_data_se],ignore_index=True).drop(["index"], axis="columns")
train_data


Unnamed: 0,document,class
0,"In this doctoral thesis, we apply premises o...",CL
1,We describe an LSTM-based model which we cal...,CL
2,We consider the cross-domain sentiment class...,CL
3,In this paper we present the approach of int...,CL
4,Generative Adversarial Networks (GANs) have ...,CL
...,...,...
695,Inter-package conflicts require the presence...,SE
696,Background: To adequately attend to non-func...,SE
697,Background: Bots help automate many of the t...,SE
698,One major problem in maintaining a software ...,SE


In [4]:
test_data_cl = test_data[test_data["class"]=='CL'].iloc[:20].reset_index()
test_data_cr = test_data[test_data["class"]=='CR'].iloc[:20].reset_index()
test_data_dc = test_data[test_data["class"]=='DC'].iloc[:20].reset_index()
test_data_ds = test_data[test_data["class"]=='DS'].iloc[:20].reset_index()
test_data_lo = test_data[test_data["class"]=='LO'].iloc[:20].reset_index()
test_data_ni = test_data[test_data["class"]=='NI'].iloc[:20].reset_index()
test_data_se = test_data[test_data["class"]=='SE'].iloc[:20].reset_index()
test_data = pd.concat([test_data_cl,test_data_cr,test_data_dc,test_data_ds,test_data_lo,test_data_ni,test_data_se], ignore_index=False).drop(["index"], axis="columns")
test_data

Unnamed: 0,document,class
0,Humor is a defining characteristic of human ...,CL
1,Auto-encoders compress input data into a lat...,CL
2,"Many common character-level, string-to-strin...",CL
3,Frequency is one of the major factors for tr...,CL
4,Centrality of emotion for the stories told b...,CL
...,...,...
15,The context-awareness of things that belong ...,SE
16,Automatic test data generation (ATG) is a ma...,SE
17,Non-functional requirements (NFRs) are deter...,SE
18,"In the context of robustness testing, the bo...",SE


In [5]:
# Preprocessing for TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_train = tfidf_vectorizer.fit_transform(train_data['document'])
tfidf_test = tfidf_vectorizer.transform(test_data['document'])

# Train a Logistic Regression model on TF-IDF
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(tfidf_train, train_data['class'])
lr_predictions = lr_model.predict(tfidf_test)


In [6]:
# Load and tokenize the RoBERTa model
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=7)
device = torch.device("cpu")
roberta_model.to(device)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [7]:
# Preprocessing for RoBERTa
def tokenize_text(texts, tokenizer, max_length):
    inputs = tokenizer(texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    return inputs

max_length = 128
batch_size = 16

# Preprocess training data for RoBERTa
roberta_train_inputs = tokenize_text(train_data['document'].tolist(), roberta_tokenizer, max_length)
with torch.no_grad():
    roberta_train_outputs = roberta_model(**roberta_train_inputs.to(device), output_hidden_states= True)[0].cpu().detach().numpy()



In [8]:
# Load LDA model
lda_model = LatentDirichletAllocation(n_components=10, random_state=42)  # Adjust the number of components as needed
lda_topics_train = lda_model.fit_transform(tfidf_train)

# Combine RoBERTa embeddings with LDA topics
combined_features_train = np.concatenate((roberta_train_outputs, lda_topics_train), axis=1)
# Train a Logistic Regression model on combined features
lr_combined_model = LogisticRegression(max_iter=1000)
lr_combined_model.fit(combined_features_train, train_data['class'])

In [9]:
# Preprocess test data for RoBERTa and LDA
roberta_test_inputs = tokenize_text(test_data['document'].tolist(), roberta_tokenizer, max_length)
with torch.no_grad():
    roberta_test_outputs = roberta_model(**roberta_test_inputs.to(device), output_hidden_states= True)[0].cpu().detach().numpy()

lda_topics_test = lda_model.transform(tfidf_test)


In [10]:
# Combine test features
combined_features_test = np.concatenate((roberta_test_outputs, lda_topics_test), axis=1)

# Make predictions using the ensemble model
lr_predictions = lr_model.predict(tfidf_test)
lr_combined_predictions = lr_combined_model.predict(combined_features_test)

In [11]:
def majority_vote(predictions_list, label_encoder):
    final_predictions = []
    for i in range(len(predictions_list[0])):
        counts = np.bincount([label_encoder.transform([preds[i]])[0] for preds in predictions_list])
        final_predictions.append(np.argmax(counts))
    return final_predictions

from sklearn.metrics import classification_report

# Create a LabelEncoder and fit it on the train labels
label_encoder = LabelEncoder()
label_encoder.fit(train_data['class'])

# Use the majority_vote function
final_predictions = majority_vote([lr_predictions, lr_combined_predictions], label_encoder)

# Transform test labels using the label_encoder
test_labels_encoded = label_encoder.transform(test_data['class'])

# Calculate accuracy and print classification report
accuracy = accuracy_score(test_labels_encoded, final_predictions)
print(f"Ensemble Model Accuracy: {accuracy}")

# Print the classification report
class_names = label_encoder.classes_
print(classification_report(test_labels_encoded, final_predictions, target_names=class_names))



Ensemble Model Accuracy: 0.65
              precision    recall  f1-score   support

          CL       0.51      0.95      0.67        20
          CR       0.60      0.60      0.60        20
          DC       0.59      0.65      0.62        20
          DS       0.82      0.70      0.76        20
          LO       0.81      0.65      0.72        20
          NI       0.63      0.60      0.62        20
          SE       0.89      0.40      0.55        20

    accuracy                           0.65       140
   macro avg       0.69      0.65      0.65       140
weighted avg       0.69      0.65      0.65       140



In [14]:
class_mapping = {
    "CL":"Computation and Language",
    "CR": "Cryptography and Security",
    "DC": "Distributed and Cluster Computing",
    "DS": "Data Structures and Algorithms",
    "LO": "Logic in Computer Science",
    "NI": "Networking and Internet Architecture",
    "SE": "Software Engineer"
}


In [21]:
def main():
    text_input = input()

    roberta_test_inputs = tokenize_text([text_input], roberta_tokenizer, max_length)
    with torch.no_grad():
        roberta_test_outputs = roberta_model(**roberta_test_inputs.to(device), output_hidden_states=True)[0].cpu().detach().numpy()

    tfidf_input = tfidf_vectorizer.transform([text_input])
    lr_pred = lr_model.predict(tfidf_input)

    # Get LDA topics for the input
    lda_topic = lda_model.transform(tfidf_input)

    # Combine test features for the ensemble model
    combined_features_test = np.concatenate((roberta_test_outputs, lda_topic), axis=1)

    # Make predictions using the ensemble model
    lr_combined_pred = lr_combined_model.predict(combined_features_test)

    # Use the majority_vote function
    final_prediction = majority_vote([lr_pred, lr_combined_pred], label_encoder)

    # Decode the predicted class label
    predicted_label = label_encoder.inverse_transform([final_prediction])[0]
    # Map predicted class abbreviation to full context
    full_label = class_mapping.get(predicted_label, "Unknown")

    print("Predicted Class:", full_label)

In [22]:
if __name__ == "__main__":
    main()

Predicted Class: Computation and Language


  y = column_or_1d(y, warn=True)
