In [1]:
import numpy as np
import pandas as pd
from nltk.stem.porter import *

In [2]:
df = pd.read_csv(
    'DataFromFirstEmail/20190118_reading_specialists.csv'
)
obs_full = list(df.OBS)
obs_full = [sent.replace('word by word', 'word-by-word') for sent in obs_full]
obs_short = [[y.strip() for y in x.lower().replace('$obs:', '').strip().split(';')] for x in obs_full]
corpus = [x.replace('.', '').replace(',', '').replace('/', '') for y in obs_short for x in y if x]

In [3]:
ps = PorterStemmer()

def filter_(req_words, corpus):
    for req_words_set in req_words:
        assert type(req_words_set) == type(set())
    stemmed_req_words = [{ps.stem(wd) for wd in req_words_set} for req_words_set in req_words]
    
    caught = []
    not_caught = []
    for line in corpus:
        stemmed = set([ps.stem(x) for x in line.strip().split()])
        distinct = False
        for stemmed_set in stemmed_req_words:
            if stemmed_set & stemmed == set():
                distinct = True
        if not distinct:
            caught.append(line)
        else:
            not_caught.append(line)
    return caught, not_caught
            

In [4]:
difficulty_text, leftover = filter_([
    {'text', 'passage'},
    {'easy', 'challenging', 'difficult', 'level', 'hard'}
], corpus)

accuracy_text, leftover = filter_([
    {'accurate', 'accuracy', 'miscue', 'misread'}
], leftover)

fluency_text, leftover = filter_([
    {'fluency', 'fluent', 'fluently', 'disfluent', 'intonation', 'decode', 'word-by-word', 'choppy'}
], leftover)

pronunciation_text, leftover = filter_([
    {'pronounce', 'pronunciation', 'enunciate', 'mispronounce', 'mispronunciation', 'attack'}
], leftover)

phrasing_text, leftover = filter_([
    {'phrasing'}
], leftover)

meaning_text, leftover = filter_([
    {'understanding', 'meaning', 'comprehension'}
], leftover)

speed_text, leftover = filter_([
    {'fast', 'quick', 'slow', 'slowly', 'rush', 'rate', 'quickly', 'pace'}
], leftover)

subst_omit_text, leftover = filter_([
    {'substitution', 'omission', 'omit', 'insert'}
], leftover)

punctuation_text, leftover = filter_([
    {'punctuation'}
], leftover)

self_correct_text, leftover = filter_([
    {'self-correct', 'correct', 'self'}
], leftover)

expression_text, leftover = filter_([
    {'expression', 'monotone'}
], leftover)

volume_text, leftover = filter_([
    {'quiet', 'volume'}
], leftover)

vocab_text, leftover = filter_([
    {'vocabulary'}
], leftover)

qualitative_text, leftover = filter_([
    {'good', 'solid', 'strong'}
], leftover)

In [5]:
len(leftover)

192

In [6]:
leftover

['student pausing for new sentences',
 'student changing pitch when reading sentences ending in a question mark',
 'student placing emphasis on certain words',
 'stumbled on syntax in final sentence',
 'student repeating certain words',
 "student pausing to correctly sound out new words 'preservation'",
 'struggled with syntax in last sentence',
 'student changing pitch when reading sentences ending in a question mark',
 "student adding -s to word that isn't plural 'lead'",
 'able to take apart multisyllabic words on the fly',
 'cross-checks efficiently',
 'leaves off -er ending on angler',
 'difficulty with the prefix un- in unusual',
 'able to take apart multisyllabic words in text',
 'noticing names of people',
 'some evidence of taking apart longer words to solve them (hydroponic)',
 'reread part of a sentence',
 'attempts initial sounds but is not able to get medial sounds',
 'still appears to be at a short vowel sound level (cvc words)',
 'has some high frequencysight words',
 'n

### from before, borrowed code

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
def preprocess(line):
    return ' '.join([ps.stem(x) for x in line.strip().split()])
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocess)
tfidf = tfidf_vectorizer.fit_transform(leftover)
kmeans = KMeans(n_clusters=10).fit(tfidf)
line_to_cluster = {line: kmeans.predict(tfidf_vectorizer.transform([line]))[0] for line in leftover}
# for center in set(line_to_cluster.values()):
#     print(center)
#     lines_with_center = [x for x in line_to_cluster if line_to_cluster[x] == center]
#     for line in lines_with_center:
#         print('  -', line)