In [None]:
! pip install transformers

In [None]:
! pip install datasets

In [None]:
!pip install --upgrade --force-reinstall --no-deps kaggle

In [8]:
import contextlib
from google.colab import files
import io

# Anything written to stdout while the upload runs is captured in a throwaway buffer
# (presumably so the uploaded kaggle.json contents are not echoed into the notebook output).
discarded_output = io.StringIO()
with contextlib.redirect_stdout(discarded_output):
    files.upload()

In [None]:
! mkdir ~/.kaggle

! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

! kaggle competitions download -c feedback-prize-effectiveness

! unzip /content/feedback-prize-effectiveness.zip -d data

### Data Preparation

Most of the code in this section is taken from the base model code.

In [10]:
import pandas as pd
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer

In [11]:
# Read the train and test tables that were unzipped into /content/data.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/data/train.csv', '/content/data/test.csv')
)

In [12]:
# Prefix each discourse text with its discourse type, on both splits.
for frame in (df_train, df_test):
    frame['raw_text'] = frame['discourse_type'] + " " + frame['discourse_text']

### Data Cleaning

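1. Lowercase the entire corpus
2. Remove stopwords
3. Remove punctuation

Note: the `clean_text` function below also drops tokens shorter than 4 characters, and it does not currently strip punctuation.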

In [13]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK stopword corpus, then load the English stopword list
# that clean_text filters against.
nltk.download('stopwords')
all_stopwords = stopwords.words("english")

def clean_text(x, stop_words=None):
  """Lower-case ``x`` and drop short tokens and stopwords.

  Args:
    x: Raw text to clean.
    stop_words: Optional iterable of lower-case words to remove. When
      omitted, the module-level ``all_stopwords`` list is used.

  Returns:
    The surviving tokens joined by single spaces.
  """
  if stop_words is None:
    stop_words = all_stopwords
  # Set membership is O(1); a list would be scanned once per token.
  stop_words = set(stop_words)

  # split() with no argument splits on any run of whitespace, so newlines and
  # tabs inside the text no longer stay glued to neighbouring words.
  tokens = x.lower().split()

  # Keep tokens of at least 4 characters that are not stopwords.
  kept = [tok for tok in tokens if len(tok) > 3 and tok not in stop_words]

  return ' '.join(kept)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
# Derive the cleaned `text` column from `raw_text` on both splits.
for frame in (df_train, df_test):
    frame['text'] = frame['raw_text'].apply(clean_text)

Label Encoding

In [15]:
# Fit the encoder on the effectiveness labels and store the integer ids in `label`.
le = LabelEncoder()
df_train['label'] = le.fit_transform(df_train['discourse_effectiveness'])

Train Validation Split

In [16]:
from sklearn.model_selection import train_test_split
# Fraction of the labelled rows held out for validation.
val_size = 0.2

# Only the model input and the target are carried into the split.
df_train_sub = df_train.loc[:, ['text', 'label']]

train_df, val_df = train_test_split(
    df_train_sub,
    test_size=val_size,
    random_state=42,  # fixed seed so the split is reproducible
)

Convert Pandas Dataframe to Arrow

In [17]:
# After train_test_split the frames carry a shuffled integer index; without
# preserve_index=False it would be stored as an extra `__index_level_0__` column.
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds = Dataset.from_pandas(val_df, preserve_index=False)

Tokenize the Datasets

In [18]:
# Checkpoint the tokenizer is loaded from.
checkpoint_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled.

    Uses the module-level ``tokenizer`` and returns its output unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize the train split first, then the validation split, in batched mode.
tokenized_train, tokenized_val = (
    split.map(preprocess_function, batched=True)
    for split in (train_ds, val_ds)
)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]



  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

Use DataCollatorWithPadding to create a batch of examples. It will also dynamically pad your text to the length of the longest element in its batch, so they are a uniform length.

In [19]:
from transformers import DataCollatorWithPadding
# Collator built around the same tokenizer that produced the encodings.
data_collator = DataCollatorWithPadding(tokenizer)

Initialize Model

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# Size the classification head from the fitted label encoder instead of hard-coding it,
# so it always matches the number of distinct values in `discourse_effectiveness`.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(le.classes_),
)

### Train the model

In [None]:
# Fine-tuning hyperparameters; Trainer output goes to ./results.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Wire the model, hyperparameters, tokenized splits and padding collator together.
# NOTE(review): training_args sets no evaluation schedule — confirm whether
# eval_dataset is actually evaluated during training.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Run fine-tuning; as the cell's last expression, the result is displayed.
trainer.train()

In [None]:
# The test split only needs the cleaned text; tokenize it the same way as the training data.
test_text_df = df_test.loc[:, ['text']]
test_ds = Dataset.from_pandas(test_text_df)
tokenized_test = test_ds.map(preprocess_function, batched=True)

In [None]:
# Run the fine-tuned model over the tokenized test split.
preds = trainer.predict(test_dataset=tokenized_test)

In [None]:
import torch

# Convert the raw model outputs (first field of the prediction result) into
# per-class probabilities with a softmax over the last axis. Using torch avoids
# pulling TensorFlow into a PyTorch pipeline just for one softmax; the result
# still exposes .numpy() for the cells below.
prediction_probabilities = torch.softmax(torch.as_tensor(preds[0]), dim=-1)

In [None]:
# Display the model's configuration object for inspection.
model.config

In [None]:
# Start the submission from the test ids, then add one probability column per class id.
submission_df = df_test.loc[:, ['discourse_id']]

class_probabilities = prediction_probabilities.numpy()
submission_df.loc[:, [0, 1, 2]] = class_probabilities
submission_df.head()

In [None]:
# Build an {encoded id: original label name} lookup from the fitted label encoder.
class_names = le.classes_
class_ids = le.transform(class_names)
le_name_mapping = {class_id: class_name for class_id, class_name in zip(class_ids, class_names)}
print(le_name_mapping)

In [None]:
# Swap the numeric class-id column headers for the label names from the mapping.
submission_df = submission_df.rename(columns=le_name_mapping)

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv("submission.csv", index=False)