# Fine-tuning distilbert-base-uncased on the Reddit anxiety dataset

In [None]:
# Checkpoint name handed to both AutoTokenizer.from_pretrained and
# AutoModelForSequenceClassification.from_pretrained further down.
MODEL = "distilbert-base-uncased"

In [2]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
class AnxietyDataset(Dataset):
    """Map-style dataset of (text, label) rows used for the train and test splits.

    Rows are read by position: column 0 is the text handed to the tokenizer
    and column 1 is the label.
    """

    def __init__(self, csv):
        """Store the rows as an array and load the tokenizer for ``MODEL``.

        Args:
            csv: Tabular data convertible with ``numpy.asarray`` (the notebook
                passes a ``pandas.DataFrame``); column 0 must hold the text
                and column 1 the label.
        """
        self.csv = np.asarray(csv)
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL)

    def __getitem__(self, idx):
        """Return the tokenized text and label of row ``idx``.

        Returns:
            dict with ``'input_ids'`` (the first sequence of the tokenizer
            output, padded to the maximum length and truncated) and
            ``'labels'`` (the row's label wrapped in a tensor).
        """
        row = self.csv[idx]
        encoded = self.tokenizer(
            row[0],
            return_tensors='pt',
            padding="max_length",
            truncation=True,
        )
        # return_tensors='pt' yields a batch dimension; [0] drops it.
        # NOTE(review): no attention_mask is returned although the text is
        # padded to max length - consider returning it, together with any
        # subclass used for inference so both stay consistent.
        return {
            'input_ids': encoded.input_ids[0],
            'labels': torch.tensor(row[1]),
        }

    def __len__(self):
        """Return the number of rows."""
        return self.csv.shape[0]

# Splitting Data into Test and Train

In [3]:
from sklearn.model_selection import train_test_split
# Load the labelled posts; the 'text' and 'label' columns are used below.
ds = pd.read_csv('../input/reddit-anxiety/reddit_anxiety.csv')

# Stratify on the label so both splits keep the same label proportions, and
# fix the seed so the split (and every metric computed on it) is reproducible.
train, test = train_test_split(ds, stratify=ds['label'], random_state=42)

# AnxietyDataset reads columns by position (0 = text, 1 = label), so select
# them explicitly instead of relying on the column order of the CSV file.
train_dataset = AnxietyDataset(train[['text', 'label']])
test_dataset = AnxietyDataset(test[['text', 'label']])

# EDA
> A word cloud gives a glimpse of which words are used most in the dataset

> Distribution of the labels in the full dataset, plotted as a histogram

In [4]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud,STOPWORDS

def color_wc(word=None, font_size=None, position=None, orientation=None, font_path=None, random_state=42):
    """Return a pseudo-random ``hsl(...)`` colour string for a word-cloud word.

    Args:
        word, font_size, position, orientation, font_path: Accepted so the
            function can be used as a ``color_func`` callback; not used.
        random_state: Either an object exposing ``randint(a, b)`` (for
            example ``random.Random``) or a seed. A seed, including the
            default ``42``, is wrapped in a fresh ``random.Random``, so the
            same seed always yields the same colour.

    Returns:
        A string of the form ``"hsl(H, S%, L%)"``.
    """
    if not hasattr(random_state, "randint"):
        # A bare seed such as the default 42 has no randint(); calling it
        # directly would raise AttributeError.
        import random

        random_state = random.Random(random_state)

    def scaled(upper):
        # Draw from 40..150 and rescale from the 0..255 range to 0..upper.
        return int(upper * float(random_state.randint(40, 150)) / 255.0)

    # Draw order (hue, saturation, lightness) determines the result for a
    # given generator state.
    hue = scaled(360.0)
    saturation = scaled(100.0)
    lightness = scaled(100.0)
    return "hsl({}, {}%, {}%)".format(hue, saturation, lightness)

# Large square figure to draw the word cloud on.
plt.subplots(figsize=(16, 16))
wc = WordCloud(
    stopwords=STOPWORDS,
    background_color="gray",
    contour_width=2,
    contour_color='blue',
    width=1500,
    height=750,
    color_func=color_wc,
    max_words=150,
    max_font_size=256,
    random_state=42,
)
# Join the posts of both splits with one separator throughout. Concatenating
# two separately joined strings would fuse the last train word with the first
# test word into a single token.
wc.generate(' '.join([*train['text'], *test['text']]))
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

In [16]:
# Histogram of the 'label' column of the full dataset `ds` (not only the
# training split).
label_values = ds['label']
plt.hist(label_values)

In [5]:
import os

# Set WANDB_DISABLED for this process before the Trainer is created
# (presumably to switch off the Weights & Biases integration - confirm
# against the installed transformers version).
os.environ.update({"WANDB_DISABLED": "true"})

# Using the Hugging Face Trainer to fine-tune the model on the dataset

In [6]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Training configuration: output goes to ./results, a single epoch, and one
# example per device per training step.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=1,
)

# Sequence-classification model built from the MODEL checkpoint.
# NOTE(review): num_labels=1 requests a single output from the head - confirm
# this matches how the 'label' column is meant to be modelled.
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=1)

# Train on the train split; the test split is the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [7]:
# Run training with the Trainer built from `model`, `training_args` and
# `train_dataset`; the call's return value is shown as the cell output.
trainer.train()

In [8]:
# Evaluate on the Trainer's eval_dataset (`test_dataset`); the returned
# value is shown as the cell output.
trainer.evaluate()

In [9]:
# Posts to be scored by the trained model; the first column is read as the
# text by AnxietyValidDataset.
valid_path = '../input/reddit-anxiety/valid.csv'
valid_csv = pd.read_csv(valid_path)

# Predicting on the validation dataset

In [10]:
class AnxietyValidDataset(AnxietyDataset):
    """Label-free variant of ``AnxietyDataset`` used for prediction.

    Only column 0 of each row (the text) is read, so the wrapped table does
    not need a label column.
    """

    def __getitem__(self, idx):
        """Return ``{'input_ids': ...}`` for the text of row ``idx``."""
        text = self.csv[idx][0]
        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            padding="max_length",
            truncation=True,
        )
        # return_tensors='pt' adds a batch dimension; keep the only sequence.
        token_ids = encoding.input_ids[0]
        return {'input_ids': token_ids}

In [11]:
# Wrap the validation frame in the label-free dataset and run prediction.
# The result unpacks into three fields; only the first (the raw model
# outputs) is kept.
valid_dataset = AnxietyValidDataset(valid_csv)
raw_pred, _, _ = trainer.predict(valid_dataset)

In [12]:
# Store the raw model outputs as the 'label' column of the validation frame.
# NOTE(review): raw_pred is presumably shaped (n_rows, 1) because the model
# was built with num_labels=1 - confirm before relying on the column dtype.
valid_csv['label'] = raw_pred

In [17]:
# Write the validation frame, now including the predicted 'label' column, to
# valid.csv in the working directory without the index column.
valid_csv.to_csv(path_or_buf='valid.csv', index=False)