In [1]:
!pip install transformers
!pip install sentencepiece
!pip install protobuf

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/1a/06/3817f9bb923437ead9a794f0ac0d03b8b5e0478ab112db4c413dd37c09da/transformers-4.33.2-py3-none-any.whl.metadata
  Using cached transformers-4.33.2-py3-none-any.whl.metadata (119 kB)
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Obtaining dependency information for huggingface-hub<1.0,>=0.15.1 from https://files.pythonhosted.org/packages/72/21/51cddb8850ed3f4dbc21e57c3dabc49e64d5577857ddda7b2eb0ffc2ec0e/huggingface_hub-0.17.2-py3-none-any.whl.metadata
  Using cached huggingface_hub-0.17.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Obtaining dependency information for regex!=2019.12.17 from https://files.pythonhosted.org/packages/03/5e/9a4cabe86a3b4e67bd2cf795a2e84de01c735c8c1c1d88795425847ccbbe/regex-2023.8.8-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading regex-2023.8.8-cp311-cp311-macosx_11_0_arm64.

In [2]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import pipeline
import sentencepiece
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch


In [3]:
# Load the dataset
# The path is relative, so it is resolved against the notebook's working directory.
file_path = './data/news.csv'  # Replace with the actual path to your file
# Later cells read the 'id', 'title' and 'contents' columns of this frame.
df = pd.read_csv(file_path)

In [4]:
# Preview the last rows of the loaded frame (tail() shows five rows by default)
df.tail()

Unnamed: 0,id,title,contents
59995,NEWS_59995,"Dolphins Break Through, Rip Rams For First Win",But that #39;s OK. Because after a 31-14 rout ...
59996,NEWS_59996,"After Steep Drop, Price of Oil Rises",The freefall in oil prices ended Monday on a s...
59997,NEWS_59997,Pro football: Culpepper puts on a show,To say Daunte Culpepper was a little frustrate...
59998,NEWS_59998,Albertsons on the Rebound,The No. 2 grocer reports double-digit gains in...
59999,NEWS_59999,Cassini Craft Spies Saturn Moon Dione (AP),AP - The Cassini spacecraft's close flyby of S...


In [5]:
# Fetch the NLTK resources this notebook relies on
# (stop-word list, WordNet data, Punkt tokenizer data)
for resource_name in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource_name)

# Single WordNet lemmatizer instance shared by the preprocessing function
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/syshin/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/syshin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/syshin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Matches every character that is neither an ASCII letter nor whitespace.
# Compiled once so the flags are applied as flags (see note in preprocess_text).
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]', re.I | re.A)

# Matches text made up solely of one or more whitespace-separated URLs.
_ONLY_URLS_RE = re.compile(r'\s*https?://\S+(?:\s+https?://\S+)*\s*')

# Lazily filled cache so the stop-word corpus is read once, not once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English stop words as a frozenset, loading them on first use only."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


# Define a function to preprocess text data
def preprocess_text(title, text):
    """Normalise one article (title + body) into a string of lemmatized tokens.

    Steps: join title and body, drop URL-only input, strip everything that is
    not an ASCII letter or whitespace, lowercase, tokenize, remove English
    stop words, lemmatize, and re-join with single spaces.

    Args:
        title: Article title; NaN/None is treated as an empty string and any
            other non-string value is converted with ``str()``.
        text: Article body; handled the same way as ``title``.

    Returns:
        The preprocessed text, or ``''`` when the combined input consists only
        of one or more URLs.
    """
    # Check for NaN values and replace them with empty strings; coerce other
    # non-string cells so the concatenation below cannot raise TypeError.
    title = '' if pd.isna(title) else str(title)
    text = '' if pd.isna(text) else str(text)

    # Concatenate title and text
    full_text = title + " " + text

    # Check if the full_text is just a URL (or multiple URLs)
    if _ONLY_URLS_RE.fullmatch(full_text):
        return ''

    # Remove any special characters and digits.
    # NOTE: the flags must not be passed as the 4th positional argument of
    # re.sub -- that slot is `count`, which silently capped the number of
    # removals at 258 (the integer value of re.I | re.A) and applied no flags.
    full_text = _NON_LETTER_RE.sub('', full_text)

    # Convert to lowercase
    full_text = full_text.lower()

    # Tokenize the text
    tokens = nltk.word_tokenize(full_text)

    # Remove stopwords (set membership; the word list is loaded only once)
    stop_words = _english_stopwords()
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize the tokens
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back to a string
    return ' '.join(lemmatized_tokens)

In [7]:
# Preprocess every article by combining its 'title' and 'contents' columns
df['preprocessed_text'] = [
    preprocess_text(article_title, article_body)
    for article_title, article_body in zip(df['title'], df['contents'])
]

In [10]:
# Load the model and tokenizer for each model
def _load_nli_checkpoint(checkpoint_name):
    """Load the tokenizer and sequence-classification model for one checkpoint.

    The tokenizer is loaded first, then the model, which is moved to the 'mps'
    device. Returns a ``(tokenizer, model)`` tuple.
    """
    checkpoint_tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
    checkpoint_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name).to('mps')
    return checkpoint_tokenizer, checkpoint_model


model_name1 = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer1, model1 = _load_nli_checkpoint(model_name1)

model_name2 = "facebook/bart-large-mnli"
tokenizer2, model2 = _load_nli_checkpoint(model_name2)

model_name3 = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
tokenizer3, model3 = _load_nli_checkpoint(model_name3)



Downloading (…)lve/main/config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/395 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/8.65M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/870M [00:00<?, ?B/s]

In [11]:
# Define candidate labels
# NOTE(review): no other cell visible in this notebook references this list; its
# entries are the same names, in the same order, as the values of `category_mapping`.
candidate_labels = ['Business', 'Entertainment', 'Politics', 'Sports', 'Tech', 'World']

In [12]:
# Map each numeric category index (0-5) to its human-readable label
category_mapping = dict(enumerate([
    'Business',
    'Entertainment',
    'Politics',
    'Sports',
    'Tech',
    'World',
]))

In [15]:
def _entailment_index(model):
    """Return the index of the model's 'entailment' output class (-1 if not declared)."""
    label2id = getattr(getattr(model, 'config', None), 'label2id', None) or {}
    for label_name, label_index in label2id.items():
        if str(label_name).lower().startswith('entail'):
            return label_index
    # Fall back to the last logit when the config does not name an entailment class.
    return -1


# Define a function to classify text
def classify_text(text, model, tokenizer, labels=None,
                  hypothesis_template='This example is about {}.'):
    """Zero-shot classify ``text`` into one of the category labels with an NLI model.

    The models loaded in this notebook are natural-language-inference
    checkpoints: their logits score entailment/neutral/contradiction for a
    (premise, hypothesis) pair, not news categories. Taking the argmax of
    those logits and looking it up in ``category_mapping`` therefore cannot
    yield a meaningful category. Instead, each candidate label is turned into
    a hypothesis, paired with ``text`` as the premise, and the label whose
    hypothesis receives the highest entailment logit is returned.

    Args:
        text: Premise text to classify.
        model: Sequence-classification NLI model; must expose ``device`` and
            ``config.label2id``.
        tokenizer: Tokenizer matching ``model``.
        labels: Optional sequence of candidate label names. Defaults to the
            values of ``category_mapping`` ordered by category index.
        hypothesis_template: ``str.format`` template that receives one label.

    Returns:
        The candidate label (a string) with the highest entailment score.
    """
    if labels is None:
        labels = [category_mapping[index] for index in sorted(category_mapping)]
    labels = list(labels)

    hypotheses = [hypothesis_template.format(label) for label in labels]

    # One (premise, hypothesis) pair per candidate label. Only the premise is
    # truncated so the hypothesis carrying the label is never cut off.
    inputs = tokenizer(
        [text] * len(labels),
        hypotheses,
        return_tensors="pt",
        padding=True,
        truncation="only_first",
        max_length=512,
    )
    # Move input tensors to whichever device the model lives on.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**inputs).logits

    entailment_scores = logits[:, _entailment_index(model)]
    best_label_position = int(torch.argmax(entailment_scores).item())
    return labels[best_label_position]

In [None]:
# Classify every preprocessed article with model 1; each prediction is a category label
df['categories_model1'] = [
    classify_text(article, model1, tokenizer1)
    for article in df['preprocessed_text']
]


In [None]:
# Translate the predicted label names back into their numeric category ids
df['categories_model1'] = df['categories_model1'].map(
    dict(zip(category_mapping.values(), category_mapping.keys()))
)

# Two-column submission frame ('id', 'category') written without the index
submission1 = df[['id', 'categories_model1']].rename(
    columns={'categories_model1': 'category'}
)
submission1.to_csv('./submissions/submission_DeBERTa-v3-base-mnli-fever-anli.csv', index=False)

In [None]:
# Classify every preprocessed article with model 2; each prediction is a category label
df['categories_model2'] = [
    classify_text(article, model2, tokenizer2)
    for article in df['preprocessed_text']
]


In [None]:
# Translate the predicted label names back into their numeric category ids
df['categories_model2'] = df['categories_model2'].map(
    dict(zip(category_mapping.values(), category_mapping.keys()))
)

# Two-column submission frame ('id', 'category') written without the index
submission2 = df[['id', 'categories_model2']].rename(
    columns={'categories_model2': 'category'}
)
submission2.to_csv('./submissions/submission_bart-large-mnli.csv', index=False)

In [None]:
# Classify every preprocessed article with model 3; each prediction is a category label
df['categories_model3'] = [
    classify_text(article, model3, tokenizer3)
    for article in df['preprocessed_text']
]


In [None]:

# The model 1 and model 2 submission cells convert predicted label names to
# numeric category ids before writing; this cell skipped that step, so its CSV
# used a different 'category' encoding. Apply the same label -> id conversion.
label_to_index = {label: index for index, label in category_mapping.items()}

submission3 = df[['id', 'categories_model3']].rename(
    columns={'categories_model3': 'category'}
)
# Values that are not label names (e.g. already-converted ids) are kept as-is,
# so re-running this cell does not turn them into NaN.
submission3['category'] = submission3['category'].map(
    lambda value: label_to_index.get(value, value)
)
submission3.to_csv('./submissions/submission_DeBERTa-v3-large-mnli-fever-anli-ling-wanli.csv', index=False)