# Letter prediction for English words

### What's the next letter, given 1, 2, 3, etc preceding letters?

In [1]:
from nltk.corpus import brown
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import string

In [2]:
sample_size = 1000 
words = list(set([w.lower() for w in brown.words()[:sample_size]]))
words = [word for word in words if word.isalpha()]

In [3]:
# DataFrame width: corresponds to the longest word
df_width = sorted(words, key=len)[-1]

In [4]:
# Process words into "bigrams" -- in this case letter pairs
def process_word(word):
    letters = list(word)
    d = []
    for i in range(len(letters) - 1):
        d.append((word, letters[i], letters[i+1]))
    d.append((word, letters[-1], '_'))
    return d       

In [5]:
all_words = []
for word in words:
    all_words.extend(process_word(word))
df = pd.DataFrame(all_words, columns = ['source', 'feature', 'target'])

In [6]:
df.head()

Unnamed: 0,source,feature,target
0,federal,f,e
1,federal,e,d
2,federal,d,e
3,federal,e,r
4,federal,r,a


In [7]:
df['feature_n'] = [ord(l) for l in df['feature']]
df['target_n'] = [ord(l) for l in df['target']]

In [8]:
df.head()

Unnamed: 0,source,feature,target,feature_n,target_n
0,federal,f,e,102,101
1,federal,e,d,101,100
2,federal,d,e,100,101
3,federal,e,r,101,114
4,federal,r,a,114,97


In [9]:
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75 

In [10]:
train, test = df[df['is_train']==True], df[df['is_train']==False]

In [11]:
features = df.columns[-3:-2]
clf = RandomForestClassifier(n_jobs=2)
y = train['target_n'] # Instead of y, _ = pd.factorize(train[1])

In [12]:
features

Index(['feature_n'], dtype='object')

In [13]:
clf.fit(train[features], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
print(clf.predict(test[features]))

[105 117 110 114 101 101  95  95 114 108 101 116 116 110 114 101  97 101
 101 110  95 114 101  95 114 101  95  95  95 116 101 110  95  95  95 110
 110  95  97 116  95  95 101  95  95 101 101 101 116 110 110  95  95  95
 101 110  95  95 101  95 116  95 101 105  95  95 101 101 101  95 101 116
 116 101  97 101 116 114 110 110  95 110 114  95  95 116  95 101  95 110
  95 116 114 110  95 101  95 101  95 101 110 110  95  95  95 101  95 110
 101 101  95 114  95  95 114 116 110 101  95  95  95 105 110 101 101 116
 114 101  95 101 101 105  95  97  95 105 101 116  95  97  95 101  95  95
 101  95 101 110 114 101 101 101 101 101 116 101 110 114  95 110 101 114
 101 116 101  95 116  95 101  95 110 116 101  95 101 110  95 101 110  95
  95  95 110 110 101 110  95  95 101 110  95  95 112 114 101 110  95  95
 101 116 105 101 105 101  95 110  95 114 105 116 116  95 110 116 114  95
  95  95 101 101 101  95 110 101 110  95 116 101 101  95  95  95 101 101
 110  95  95 116 101  95 110 101  95 110 110 110 11

In [15]:
target_names = np.array([chr(x) for x in list(df['target_n'])])

In [16]:
test.head()

Unnamed: 0,source,feature,target,feature_n,target_n,is_train
0,federal,f,e,102,101,False
7,juries,j,u,106,117,False
10,juries,i,e,105,101,False
14,surveillance,u,r,117,114,False
19,surveillance,l,l,108,108,False


In [17]:
preds = clf.predict(test[features])
preds[:20]

array([105, 117, 110, 114, 101, 101,  95,  95, 114, 108, 101, 116, 116,
       110, 114, 101,  97, 101, 101, 110])

In [18]:
ct = pd.crosstab(test['target_n'], preds)
ct

col_0,95,97,101,105,108,110,112,114,116,117
target_n,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
95,59,2,17,2,0,3,0,0,0,0
97,11,5,15,2,1,1,0,1,0,0
98,0,0,2,0,0,2,0,1,0,0
99,4,0,1,0,0,6,0,0,3,0
100,11,0,5,0,0,2,0,1,4,0
101,20,3,37,5,3,2,1,9,0,0
102,1,0,0,3,0,2,0,0,0,0
103,9,0,0,0,0,3,0,0,0,0
104,2,0,6,0,0,0,0,0,0,0
105,6,1,25,0,0,1,0,1,2,0
