In [1]:
from fastai.text import *
import pandas as pd
from path import Path as path
import re
import random
import gc

In [6]:
reg = "(intp|intj|infp|infj|istj|istp|isfj|isfp|entj|entp|enfj|enfp|estp|estj|esfp|esfj)"

In [15]:
def augment(original, n = 20000):
    if len(original) > n:
        return original.sample(n=n)
    texts = original['text']
    texts2 = [re.sub(reg,"",t, flags=re.I) for t in texts]
    original['text'] = texts2
    n0 = len(texts)
    n1 = n - n0
    df = pd.DataFrame()
    addtexts = []
    for i in range(0, n1):
        segs = random.randint(2,10)
        samples = texts.sample(n=segs)
        newtext = "\n".join(list(samples))
        addtexts.append(newtext)
    df['subreddit'] = [original['subreddit'][0]]*n1
    df['text'] = addtexts
    return original.append(df)

In [16]:
test = pd.read_csv("intp.csv", index_col = 0).dropna()

In [17]:
g = augment(test)

In [19]:
g.tail()

Unnamed: 0,subreddit,text
72424,intp,I don't relate to any of the stereotypes you l...
72425,intp,"One time, in sophomore year history class, I m..."
72426,intp,Well I keep getting offended over things I rea...
72427,intp,People always said to me my voice makes them s...
72428,intp,Don't forget that you can outsource that part ...


In [16]:
path2 = path("model/").mkdir_p()
df = pd.DataFrame()
for f in path("./").files():
    if f.ext == ".csv":
        tmp = pd.read_csv(f,index_col = 0).dropna()
        df = df.append(augment(tmp))
        gc.collect()

In [17]:
print(len(df))

320000


In [18]:
df.tail()

Unnamed: 0,subreddit,text
23275,intp,"You are correct, I did insert some of my moral..."
11452,intp,It doesn’t only joke about the definitions but...
15806,intp,"I'm not sure how hypothetical that example is,..."
16514,intp,it does allow for the person to pay attention ...
22609,intp,why'd you edit your comment? Decided I was cut...


In [19]:
df.to_csv("model/mbti.csv", index=None)

In [2]:
gc.collect()
path2 = path("model/").mkdir_p()
# Language model data
data_lm = TextLMDataBunch.from_csv(path2, 'mbti.csv')
# Classifier model data
data_clas = TextClasDataBunch.from_csv(path2, 'mbti.csv', vocab=data_lm.train_ds.vocab, bs=32)

In [3]:
gc.collect()
data_lm.save('data_lm_export.pkl')
data_clas.save('data_clas_export.pkl')

In [4]:
gc.collect()
data_lm = load_data(path2, 'data_lm_export.pkl')
data_clas = load_data(path2, 'data_clas_export.pkl', bs=16)

In [5]:
gc.collect()
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.5)

In [6]:
gc.collect()

learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.91741,3.645595,0.299471,2:17:46


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [7]:
learn.unfreeze()

In [8]:

learn.fit_one_cycle(1, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,2.951338,2.647444,0.476425,2:33:20


In [9]:
learn.predict("This is a review about", n_words=10)

'This is a review about people wanting to justify their behaviour when they suppose .'

In [10]:
learn.save_encoder('ft_enc')

In [11]:
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5)
learn.load_encoder('ft_enc')

In [12]:
data_clas.show_batch()

text,target
"xxbos xxmaj alright . xxmaj this sub seems a little less active than the other xxup mbti subs so i 'm crossing my fingers that i get some responses . xxmaj for context i 'm pr xxrep 4 o bably an or an . \n i 've known this since high school , but we had n't really seen each other since 2007 . xxmaj every few years he",estp
"xxbos xxmaj older male here , this is funny as my parents are also exfp and . xxmaj first of all his family probably has another source of income from something they are n't telling you about . i would ask your boyfriend about that at some point if i were you , although not necessarily right away , they do n't have to tell you the details of their",enfj
xxbos xxmaj hey there ! i m so happy you posted this . i m and my sister is and i came to this subreddit for the same reason you posted this . xxmaj my sister tends to be extremely judgmental and lash out at me in front of other people . xxmaj it has caused a serious strain on our relationship . xxmaj she embarrasses me and since i,esfj
"xxbos i 'm a xxmaj christian myself , and i heard a pastor give a sermon once that i think could help you . xxmaj basically he said that he does n't believe there is anyone on this xxmaj earth who honestly has faith in nothing . xxmaj some people have faith in xxmaj god , sure , but there are hundreds of other things to have faith in --",infp
"xxbos xxmaj currently reading xxmaj ernest xxmaj hemingway quotes as he ’s an . & i ’m thinking about how true his quote is “ xxmaj when people talk , listen completely . xxmaj most people never listen . ” xxmaj when i ’m speaking with someone they have my full attention & i notice when i speak to others and sometimes even when responding to that person they can",estp


In [13]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,1.977255,1.859281,0.342479,3:12:28


In [14]:
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(5e-3/2., 5e-3))

epoch,train_loss,valid_loss,accuracy,time
0,1.713701,1.489366,0.490492,2:57:37


In [15]:
learn.unfreeze()
learn.fit_one_cycle(1, slice(2e-3/100, 2e-3))

epoch,train_loss,valid_loss,accuracy,time
0,1.481661,1.307122,0.554694,5:17:58


In [16]:
learn.export("model")

In [17]:
learn.save("save1")

In [18]:
learn.fit_one_cycle(1, slice(2e-4/100, 2e-4))

epoch,train_loss,valid_loss,accuracy,time
0,1.352522,1.293065,0.562882,5:19:39


In [19]:
learn.export("model")