# Letter prediction for English words

### What's the next letter, given 1, 2, 3, etc preceding letters?

In [1]:
from nltk.corpus import brown
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import string

In [2]:
sample_size = 10000 
words = list(set([w.lower() for w in brown.words()[:sample_size]]))
words = [word for word in words if word.isalpha()]

In [3]:
# DataFrame width: corresponds to the longest word
df_width = len(sorted(words, key=len)[-1])

In [4]:
# Process words into "bigrams" -- in this case letter pairs
def process_word(word, n):
    letters = list(word)
    d = []
    for i in range(len(letters) - 1):
        foo = tuple([word] + [ord('_') for n in range(n-len(word[:i+1]))]
                           + [ord(l) for l in letters[:i+1]]
                           + [ord(letters[i+1])])
        d.append(foo)
    return d       

In [5]:
process_word('keeping', 10)

[('keeping', 95, 95, 95, 95, 95, 95, 95, 95, 95, 107, 101),
 ('keeping', 95, 95, 95, 95, 95, 95, 95, 95, 107, 101, 101),
 ('keeping', 95, 95, 95, 95, 95, 95, 95, 107, 101, 101, 112),
 ('keeping', 95, 95, 95, 95, 95, 95, 107, 101, 101, 112, 105),
 ('keeping', 95, 95, 95, 95, 95, 107, 101, 101, 112, 105, 110),
 ('keeping', 95, 95, 95, 95, 107, 101, 101, 112, 105, 110, 103)]

In [6]:
all_words = []
for word in words:
    all_words.extend(process_word(word, df_width))
cols = ['source']+['feature'+str(n) for n in range(df_width)]+['target']
print(all_words[0])
print(cols)
df = pd.DataFrame(all_words, columns=cols)

('improving', 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 105, 109)
['source', 'feature0', 'feature1', 'feature2', 'feature3', 'feature4', 'feature5', 'feature6', 'feature7', 'feature8', 'feature9', 'feature10', 'feature11', 'feature12', 'feature13', 'feature14', 'feature15', 'target']


In [7]:
df.head()

Unnamed: 0,source,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13,feature14,feature15,target
0,improving,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,105,109
1,improving,95,95,95,95,95,95,95,95,95,95,95,95,95,95,105,109,112
2,improving,95,95,95,95,95,95,95,95,95,95,95,95,95,105,109,112,114
3,improving,95,95,95,95,95,95,95,95,95,95,95,95,105,109,112,114,111
4,improving,95,95,95,95,95,95,95,95,95,95,95,105,109,112,114,111,118


In [8]:
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75 

In [9]:
train, test = df[df['is_train']==True], df[df['is_train']==False]

In [10]:
features = df.columns[1:-3]
clf = RandomForestClassifier(n_jobs=2)
y = train['target'] # Instead of y, _ = pd.factorize(train[1])

In [11]:
features

Index(['feature0', 'feature1', 'feature2', 'feature3', 'feature4', 'feature5',
       'feature6', 'feature7', 'feature8', 'feature9', 'feature10',
       'feature11', 'feature12', 'feature13', 'feature14'],
      dtype='object')

In [12]:
clf.fit(train[features], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
print(clf.predict(test[features]))

[101 115 101 ..., 111 110 105]


In [14]:
test.head()

Unnamed: 0,source,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13,feature14,feature15,target,is_train
0,improving,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,105,109,False
1,improving,95,95,95,95,95,95,95,95,95,95,95,95,95,95,105,109,112,False
4,improving,95,95,95,95,95,95,95,95,95,95,95,105,109,112,114,111,118,False
5,improving,95,95,95,95,95,95,95,95,95,95,105,109,112,114,111,118,105,False
9,almost,95,95,95,95,95,95,95,95,95,95,95,95,95,95,97,108,109,False


In [15]:
preds = clf.predict(test[features])
preds[:20]

array([101, 115, 101, 101, 116, 110, 111, 115, 114, 108, 100, 110, 114,
       101, 101, 107, 105, 111, 105, 110])

In [16]:
ct = pd.crosstab(test['target'], preds)
ct

col_0,97,98,99,100,101,102,103,104,105,107,...,111,112,114,115,116,117,118,119,121,122
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
97,42,1,1,2,108,0,0,4,22,0,...,14,1,26,10,12,7,0,0,0,0
98,2,5,4,0,8,0,0,0,0,0,...,1,0,2,0,3,0,0,0,0,0
99,5,0,28,5,13,0,0,2,3,0,...,5,0,9,15,7,3,1,0,1,0
100,6,1,5,17,22,0,5,0,9,0,...,5,0,12,17,17,1,2,0,0,0
101,19,1,6,12,248,2,10,6,36,2,...,19,2,22,27,34,13,0,0,4,1
102,1,0,1,2,7,2,3,0,1,0,...,1,1,2,3,7,0,0,0,0,0
103,5,0,4,1,16,0,9,0,3,0,...,1,0,10,5,8,2,0,1,0,0
104,3,0,0,0,22,0,1,6,1,2,...,2,1,7,4,3,0,0,0,0,0
105,16,1,5,7,116,0,4,4,52,2,...,15,0,21,9,16,13,0,0,1,0
106,0,0,2,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
