https://docs.fast.ai/text.html

In [1]:
from fastai import *
from fastai.text import *

In [3]:
import fastai; fastai.__version__

'1.0.22'

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

In [18]:
path = Path('/data/quora-insincere-questions-classification')

In [5]:
df0 = pd.read_csv(path/'train.csv')

In [60]:
df = df0.sample(50000)


In [61]:
df.shape

(50000, 3)

In [62]:
df.head()

Unnamed: 0,qid,question_text,target
8948,01bf7cb32fd37aa36496,How have so many people got depression nowaday...,1
1037001,cb31fdecd7e329304c2f,How can I block a person who have already bloc...,0
560443,6dcad087523db5f2629f,How can I start a hair supply business online?,0
269176,34b160d79da820e1d1b3,If you could be friends with anybody from Mort...,0
1238684,f2bf48f78de0995d4270,Should I have an inner dialogue when I read?,0


## prepare data for fastai

In [63]:
df.drop(columns=['qid'], inplace=True)

df['target'] = df.target.astype(bool)

df['is_valid'] = np.random.choice([True, False], size=len(df), p=[0.2, 0.8])

df = df[['target', 'question_text', 'is_valid']]

df.columns = ['label', 'text', 'is_valid']

In [64]:
df.head()

Unnamed: 0,label,text,is_valid
8948,True,How have so many people got depression nowaday...,False
1037001,False,How can I block a person who have already bloc...,False
560443,False,How can I start a hair supply business online?,True
269176,False,If you could be friends with anybody from Mort...,False
1238684,False,Should I have an inner dialogue when I read?,False


In [65]:
print("\n\n".join(df[df.label].text.sample(5).values))

Why is Quora moderation so corrupt?

Why do Muslims say nonsense about Modi, can't they see changes he is bringing ?

Is it worthwhile for the USA to have it's cities under the menace to be nuked, in exchange for it's intervenience in other countries?

Why do you think its ok for Sarah Sanders to be kicked out of a restaurant? Does political discrimination exist in your opinion?

Is there a time when someone has argued against flat earth while also actually understanding the flat earth followers beliefs ie: flat earth would mean if you walk away from the center youd be diagonal vs earth accelerating upwards for gravity(F.E)?


In [66]:
print(f"{df.label.mean()*100:.2f}% questions are insincere.")

6.24% questions are insincere.


In [67]:
df.to_csv(path/f"fastai_train.csv", index=False)

To get a DataBunch quickly, there are also several factory methods depending on how our data is structured. They are all detailed in text.data, here we'll use the method from_csv of the TextLMDataBunch (to get the data ready for a language model) and TextClasDataBunch (to get the data ready for a text classifier) classes.

In [68]:
# Language model data
data_lm = TextLMDataBunch.from_csv(path, path/f"fastai_train.csv")
# Classifier model data
data_clas = TextClasDataBunch.from_csv(path, path/f"fastai_train.csv", vocab=data_lm.train_ds.vocab, bs=32)

In [69]:
data_lm.save()
data_clas.save()

In [70]:
data_lm = TextLMDataBunch.load(path)
data_clas = TextClasDataBunch.load(path, bs=32)
# Note that you can load the data with different DataBunch parameters (batch size, bptt,...)

In [None]:
# Fine-tuning a language model
# We can use the data_lm object we created earlier to fine-tune a pretrained language model. fast.ai has an English model available that we can download. We can create a learner object that will directly create a model, download the pretrained weights and be ready for fine-tuning.

learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.5)

In [None]:
learn.lr_findind()

In [71]:
learn.fit_one_cycle(1, 1e-2)

Total time: 00:13
epoch  train_loss  valid_loss  accuracy
1      4.385365    3.946091    0.341313  (00:13)



In [72]:
# Like a computer vision model, we can then unfreeze the model and fine-tune it.

learn.unfreeze()
learn.fit_one_cycle(1, 1e-3)

Total time: 00:15
epoch  train_loss  valid_loss  accuracy
1      3.896597    3.700320    0.361663  (00:15)



In [73]:
# And finally we save the encoder to be able to use it for classification in the next section.

learn.save_encoder('ft_enc')
# Building a classifier
# We now use the data_clas object we created earlier to build a classifier with our fine-tuned encoder. The learner object can be done in a single line.

In [92]:
learn = text_classifier_learner(data_clas, drop_mult=0.5)
learn.load_encoder('ft_enc')
learn.fit_one_cycle(1, 1e-6)

Total time: 00:24
epoch  train_loss  valid_loss  accuracy
1      0.728509    0.693286    0.522348  (00:24)



In [93]:
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(5e-7/2., 5e-7))

Total time: 00:30
epoch  train_loss  valid_loss  accuracy
1      0.725627    0.669718    0.589041  (00:30)



In [94]:
learn.unfreeze()
learn.fit_one_cycle(1, slice(2e-7/100, 2e-7))

Total time: 00:58
epoch  train_loss  valid_loss  accuracy
1      0.719857    0.660441    0.619838  (00:58)



In [98]:
preds,y=learn.get_preds()

In [99]:
from sklearn.metrics import f1_score

In [105]:
y[:10]

tensor([1, 1, 0, 1, 1, 1, 1, 1, 1, 1])

In [None]:
len()

In [106]:
f1_score(y, preds[:, 0]>0.5)

0.5240690978886756

In [107]:
f1_score(y, preds[:, 0]>0.2)

0.9662514858649026

In [108]:
f1_score(y, preds[:, 0]>0.01)

0.9662514858649026

In [109]:
f1_score(y, preds[:, 1]>0.01)

0.9662514858649026

tensor(9348)