### first pass, discretizing free text annotations

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Load the reading-specialist annotations and pull out the raw free-text
# observations (the OBS column). The normalized `obs_short` and flattened
# `corpus` are derived from `obs_full` in the cells that follow, so they are
# not recomputed here.
df = pd.read_csv(
    'DataFromFirstEmail/20190118_reading_specialists.csv'
)
# Bracket access avoids any clash between the column name and DataFrame attributes.
obs_full = list(df['OBS'])

In [3]:
# Normalize every raw observation: lowercase it, drop the '$obs:' tag, trim,
# then split on ';' and trim each resulting fragment.
obs_short = []
for raw_obs in obs_full:
    untagged = raw_obs.lower().replace('$obs:', '').strip()
    obs_short.append([fragment.strip() for fragment in untagged.split(';')])

In [4]:
# Flatten the per-observation fragments into a single list, skipping empty strings.
corpus = []
for fragments in obs_short:
    corpus.extend(fragment for fragment in fragments if fragment)
# Peek at one arbitrary entry as a sanity check.
corpus[500]

'rereads to self-correct'

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [6]:
# Import only the name this cell needs rather than a wildcard, so it is clear
# where `PorterStemmer` comes from and the namespace is not polluted.
from nltk.stem.porter import PorterStemmer

# Porter stemmer shared by the TF-IDF `preprocess` helper.
ps = PorterStemmer()
def preprocess(line):
    """Return `line` with each whitespace-separated word stemmed by `ps`.

    The stemmed words are re-joined with single spaces. Note that a later cell
    in this notebook defines another function with the same name.
    """
    # split() with no arguments already ignores leading/trailing whitespace.
    stemmed_words = map(ps.stem, line.split())
    return ' '.join(stemmed_words)
# Build the TF-IDF matrix for the corpus; each line is passed through
# `preprocess` (stemming) before the vectorizer tokenizes it.
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocess)
tfidf = tfidf_vectorizer.fit_transform(raw_documents=corpus)

In [7]:
# Cluster the TF-IDF vectors into 5 groups. The seed is fixed so the cluster
# assignments are the same on every Restart & Run All; n_init is pinned because
# its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=400).fit(tfidf)

In [8]:
# Map each corpus line to its cluster id. `tfidf` already holds one row per
# corpus line (in order), so a single batched predict replaces a separate
# transform + predict call for every line. Duplicate lines collapse to one key.
line_to_cluster = dict(zip(corpus, kmeans.predict(tfidf)))

In [9]:
# Visit each cluster id and collect the lines assigned to it.
# Per-line printing is currently disabled, so this cell produces no output.
for center in set(line_to_cluster.values()):
    lines_with_center = [text for text, label in line_to_cluster.items() if label == center]
    for line in lines_with_center:
        # print('  -', line)
        pass

In [10]:
def filter_(terms, lines=None):
    """Report how many lines each term removes, then print the survivors.

    Terms are applied in order as plain substring tests: a line is dropped as
    soon as it contains the current term, so each reported count only covers
    lines not already removed by an earlier term.

    Parameters
    ----------
    terms : iterable of str
        Substrings to filter on, in the order they should be applied.
    lines : iterable of str, optional
        Lines to filter. Defaults to the notebook-level ``corpus`` so existing
        ``filter_(terms)`` calls behave as before.

    Returns
    -------
    None
        Everything is reported via ``print``; the input is not modified.
    """
    remaining = list(corpus if lines is None else lines)
    for term in terms:
        kept = [text for text in remaining if term not in text]
        print(term, 'filtered out', len(remaining) - len(kept), '->', len(kept), 'left')
        remaining = kept
    for text in remaining:
        print(text)

In [11]:
# Substring stems for annotation themes that have already been identified.
# They are applied in this order, so each count in the output only covers
# lines not removed by an earlier stem; whatever is printed at the end is
# still uncategorized.
categorized_stems = [
    'expressi', 'fluen', 'phras', 'word by', 'word-by',
    'intona', 'self-correct', 'self correct', 'accura', 'punctua',
    'meaning', 'substitut', 'skip', 'miscu', 'vocabul',
    'quiet', 'slow', 'decod', 'omit', 'omission',
    'pronounc', 'pronunc', 'attack', 'mis', 'monotone',
]
filter_(categorized_stems)

expressi filtered out 100 -> 853 left
fluen filtered out 29 -> 824 left
phras filtered out 69 -> 755 left
word by filtered out 24 -> 731 left
word-by filtered out 2 -> 729 left
intona filtered out 11 -> 718 left
self-correct filtered out 73 -> 645 left
self correct filtered out 4 -> 641 left
accura filtered out 18 -> 623 left
punctua filtered out 79 -> 544 left
meaning filtered out 117 -> 427 left
substitut filtered out 28 -> 399 left
skip filtered out 5 -> 394 left
miscu filtered out 12 -> 382 left
vocabul filtered out 15 -> 367 left
quiet filtered out 7 -> 360 left
slow filtered out 8 -> 352 left
decod filtered out 26 -> 326 left
omit filtered out 14 -> 312 left
omission filtered out 1 -> 311 left
pronounc filtered out 3 -> 308 left
pronunc filtered out 1 -> 307 left
attack filtered out 10 -> 297 left
mis filtered out 10 -> 287 left
monotone filtered out 7 -> 280 left
passage seems to be at independent reading level
passage seems slightly easy for this student, may need a higher leve

### testing lda code (from priya-dwivedi)

In [12]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator so later draws from it are repeatable.
RANDOM_SEED = 400
np.random.seed(RANDOM_SEED)

In [13]:
stemmer = SnowballStemmer("english")
# Build the lemmatizer once; previously a new WordNetLemmatizer was created
# for every token passed to lemmatize_stemming.
lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Lemmatize `text` as a verb (pos='v'), then Snowball-stem the result.

    Parameters
    ----------
    text : str
        A single token.

    Returns
    -------
    The value returned by ``stemmer.stem`` for the lemmatized token.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

# Tokenize, filter, then lemmatize + stem
def preprocess(text):
    """Turn `text` into a list of normalized tokens for topic modeling.

    The text is tokenized with gensim's ``simple_preprocess``. A token is kept
    only if it is longer than three characters and is not in gensim's
    STOPWORDS; each kept token is passed through ``lemmatize_stemming``.

    Note: this reuses the name of the TF-IDF `preprocess` helper defined
    earlier in the notebook, which it replaces from this cell onwards.
    """
    return [
        lemmatize_stemming(token)
        for token in simple_preprocess(text)
        if len(token) > 3 and token not in STOPWORDS
    ]

In [14]:
# One token list per corpus line, in corpus order.
processed_docs = [preprocess(doc) for doc in corpus]

In [15]:
# Build the token <-> integer id mapping from the tokenized documents.
dictionary = gensim.corpora.Dictionary(documents=processed_docs)

In [16]:
# Show the first 11 (id, token) pairs of the dictionary as a sanity check.
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    if position >= 10:
        break

0 independ
1 level
2 passag
3 read
4 accur
5 adequ
6 express
7 speed
8 student
9 usual
10 attend


In [18]:
# Convert each tokenized document to its bag-of-words form via the dictionary.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [19]:
# Pick one document (by position in bow_corpus) to inspect by hand.
document_num = 1
bow_doc_x = bow_corpus[document_num]

# Disabled inspection helper: when uncommented, it prints, for each entry of
# bow_doc_x, the id in position 0, the dictionary token for that id, and the
# count in position 1.
# for i in range(len(bow_doc_x)):
#     print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], 
#                                                      dictionary[bow_doc_x[i][0]], 
#                                                      bow_doc_x[i][1]))

In [20]:
# Train an 8-topic LDA model on the bag-of-words corpus.
# random_state is passed explicitly so the model's initialization does not
# depend on whatever the global NumPy RNG state happens to be when this cell
# runs (e.g. when the cell is re-run without re-seeding).
# NOTE(review): with workers > 1, results may still vary slightly between
# runs — confirm if exact reproducibility matters.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=8,
    id2word=dictionary,
    passes=10,
    workers=2,
    random_state=400,
)

In [21]:
# Print every topic (num_topics=-1 means all) with its top weighted words,
# followed by blank lines as a separator.
topic_template = "Topic: {} \nWords: {}"
for topic_id, topic_words in lda_model.print_topics(num_topics=-1):
    print(topic_template.format(topic_id, topic_words))
    print("\n")

Topic: 0 
Words: 0.165*"student" + 0.110*"read" + 0.049*"paus" + 0.041*"sentenc" + 0.033*"quick" + 0.021*"text" + 0.020*"check" + 0.019*"word" + 0.019*"inform" + 0.017*"inconsist"


Topic: 1 
Words: 0.114*"word" + 0.084*"sight" + 0.075*"mean" + 0.053*"high" + 0.047*"frequenc" + 0.035*"maintain" + 0.030*"monoton" + 0.027*"reread" + 0.027*"read" + 0.024*"visual"


Topic: 2 
Words: 0.177*"express" + 0.118*"read" + 0.076*"good" + 0.040*"limit" + 0.039*"littl" + 0.039*"inton" + 0.037*"accur" + 0.035*"reader" + 0.023*"rate" + 0.019*"adequ"


Topic: 3 
Words: 0.260*"word" + 0.056*"read" + 0.047*"substitut" + 0.031*"student" + 0.030*"insert" + 0.028*"evid" + 0.028*"multisyllab" + 0.028*"mean" + 0.020*"apart" + 0.018*"interfer"


Topic: 4 
Words: 0.147*"word" + 0.064*"sound" + 0.049*"decod" + 0.046*"end" + 0.042*"omit" + 0.030*"difficulti" + 0.023*"vowel" + 0.021*"initi" + 0.020*"attend" + 0.017*"begin"


Topic: 5 
Words: 0.167*"phrase" + 0.113*"read" + 0.077*"word" + 0.054*"long" + 0.046*"shor