In [1]:
import pandas as pd

There is considerable linguistic variety in Singapore road names: Malaysians names (Jalan Besar), British names (Northumberland road), Chinese names (Keong Saik Road), Indian names (Veerasamy road), Jewish names (Belilios road), and the usual generic sorts of names that describe either area landmarks or other common noun.

![Image](Images/streets.png)

In [2]:
street_names = pd.read_csv('Singapore_street_names')
street_names.head(10)

Unnamed: 0,name,tag,classification
0,Saiboo,Street,Indian
1,Merchant,Loop,Generic
2,Hill,Street,Generic
3,Ophir,Road,Malay
4,Buona,Vista Road,Other
5,Sengkang,Avenue,Chinese
6,Ang Mo Kio,Avenue,Chinese
7,Brickland,Road,British
8,Choa Chu Kang,Road,Chinese
9,Woodlands,Avenue,Generic


- Chinese (all dialects including Cantonese, Hokkien, Mandarin, etc)
- Malay
- Indian (all languages of the subcontinent)
- British
- Generic (Race Course Road, Sunrise Place)
- Other (generally other languages).

In [54]:
street_names.classification.value_counts()

Malay      927
British    798
Generic    497
Chinese    403
Other      167
Indian      42
Name: classification, dtype: int64

In [64]:
street_names[street_names.classification=='Chinese'].head(20)

Unnamed: 0,name,tag,classification
5,Sengkang,Avenue,Chinese
6,Ang Mo Kio,Avenue,Chinese
8,Choa Chu Kang,Road,Chinese
12,Nanyang,Walk,Chinese
15,Kee Sun,Avenue,Chinese
21,Boon Tat,Link,Chinese
26,Binkiang,Lorong,Chinese
28,Sims,Avenue,Chinese
29,Hougang,Avenue,Chinese
44,Whampoa,Drive,Chinese


In [67]:
'malay_tag columns'
malay_prefix_tags = ["Jalan", "Lorong", "Bukit", "Lengkok", "Taman", "Kampong", "Lengkong"]
street_names['malay_tag'] = street_names.tag.isin(malay_prefix_tags).astype(float)

In [68]:
street_names.malay_tag.value_counts()

0.0    2367
1.0     467
Name: malay_tag, dtype: int64

In [4]:
street_names.head()

Unnamed: 0,name,tag,classification
0,Saiboo,Street,Indian
1,Merchant,Loop,Generic
2,Hill,Street,Generic
3,Ophir,Road,Malay
4,Buona,Vista Road,Other


## New Features

In [178]:
'malay_tag columns'
malay_prefix_tags = ["Jalan", "Lorong", "Bukit", "Lengkok", "Taman", "Kampong", "Lengkong"]
street_names['malay_tag'] = street_names.tag.isin(malay_prefix_tags).astype(float)
street_names.head()

Unnamed: 0,name,tag,classification,malay_tag
0,Saiboo,Street,Indian,0.0
1,Merchant,Loop,Generic,0.0
2,Hill,Street,Generic,0.0
3,Ophir,Road,Malay,0.0
4,Buona,Vista Road,Other,0.0


In [181]:
street_names['len'] = street_names.name.str.len()
street_names.head()

Unnamed: 0,name,tag,classification,malay_tag,len
0,Saiboo,Street,Indian,0.0,6
1,Merchant,Loop,Generic,0.0,8
2,Hill,Street,Generic,0.0,4
3,Ophir,Road,Malay,0.0,5
4,Buona,Vista Road,Other,0.0,5


In [184]:
street_names['avg_word_len'] = street_names.name.apply(lambda x: np.mean([len(w) for w in x.split()]))
street_names.head(10)

Unnamed: 0,name,tag,classification,malay_tag,len,avg_word_len
0,Saiboo,Street,Indian,0.0,6,6.0
1,Merchant,Loop,Generic,0.0,8,8.0
2,Hill,Street,Generic,0.0,4,4.0
3,Ophir,Road,Malay,0.0,5,5.0
4,Buona,Vista Road,Other,0.0,5,5.0
5,Sengkang,Avenue,Chinese,0.0,8,8.0
6,Ang Mo Kio,Avenue,Chinese,0.0,10,2.666667
7,Brickland,Road,British,0.0,9,9.0
8,Choa Chu Kang,Road,Chinese,0.0,13,3.666667
9,Woodlands,Avenue,Generic,0.0,9,9.0


In [185]:
street_names['num_words'] = street_names.name.apply(lambda x: len(x.split()))
street_names.head(10)

Unnamed: 0,name,tag,classification,malay_tag,len,avg_word_len,num_words
0,Saiboo,Street,Indian,0.0,6,6.0,1
1,Merchant,Loop,Generic,0.0,8,8.0,1
2,Hill,Street,Generic,0.0,4,4.0,1
3,Ophir,Road,Malay,0.0,5,5.0,1
4,Buona,Vista Road,Other,0.0,5,5.0,1
5,Sengkang,Avenue,Chinese,0.0,8,8.0,1
6,Ang Mo Kio,Avenue,Chinese,0.0,10,2.666667,3
7,Brickland,Road,British,0.0,9,9.0,1
8,Choa Chu Kang,Road,Chinese,0.0,13,3.666667,3
9,Woodlands,Avenue,Generic,0.0,9,9.0,1


## Machine learning

In [227]:
X = street_names[['name','malay_tag','len','avg_word_len','num_words']]
y = street_names['classification']
X.head()

Unnamed: 0,name,malay_tag,len,avg_word_len,num_words
0,Saiboo,0.0,6,6.0,1
1,Merchant,0.0,8,8.0,1
2,Hill,0.0,4,4.0,1
3,Ophir,0.0,5,5.0,1
4,Buona,0.0,5,5.0,1


In [198]:
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(ngram_range=(1,4), analyzer='char')
column_trans = make_column_transformer(
    (vect, 'name'),
    remainder='passthrough')

In [228]:
XX = column_trans.fit_transform(X).toarray()

In [229]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(XX,y)

In [230]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
clf = LinearSVC()
pipe = make_pipeline(column_trans, clf)

In [231]:
X_train

array([[0., 0., 0., ..., 8., 8., 1.],
       [0., 0., 0., ..., 9., 9., 1.],
       [0., 0., 0., ..., 5., 5., 1.],
       ...,
       [0., 0., 0., ..., 5., 5., 1.],
       [0., 0., 0., ..., 5., 5., 1.],
       [0., 0., 0., ..., 6., 6., 1.]])

In [235]:
clf.fit(X_train,y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [236]:
y_test_pred = clf.predict(X_test)

In [238]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [237]:
confusion_matrix(y_test,y_test_pred)

array([[163,   3,  16,   0,   1,   5],
       [  0, 117,   0,   0,   3,   0],
       [ 21,   1,  98,   0,   3,   2],
       [  2,   0,   1,   6,   2,   0],
       [  5,   3,   1,   0, 218,   2],
       [  7,   0,   6,   0,   4,  19]], dtype=int64)

In [239]:
accuracy_score(y_test,y_test_pred)

0.8758815232722144

## Pipeline

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
FunctionTransformer


The features we'll be adding are these:

- Number of words in road name: More words => more likely to be Chinese
- Average word length in road name: Longer words => more likely to be British or Indian
- Are all words in dictionary: If yes => likely to be Generic
- Is the road type Malay? If yes => very correlated with being Malay

In [214]:
y = street_names['classification']

In [215]:
y.head()

0     Indian
1    Generic
2    Generic
3      Malay
4      Other
Name: classification, dtype: object

In [5]:
X = street_names[['name','tag']]
X.head()

Unnamed: 0,name,tag
0,Saiboo,Street
1,Merchant,Loop
2,Hill,Street
3,Ophir,Road
4,Buona,Vista Road


In [51]:
X.tag.fillna('None',inplace=True)

In [201]:
def get_len(X):
    'lenght of each name'
    return np.expand_dims([len(x) for x in X],axis=1)
def get_avg_word_len(X):
    'average word length'
    return np.expand_dims([np.mean([len(w) for w in x.split()]) for x in X], axis=1)
    #return np.X.apply(lambda x: np.mean([len(w) for w in x.split()]))
def get_num_words(X):
    'number of words'
    return np.expand_dims([len(x.split()) for x in X],axis=1)
    #return X.apply(lambda x: len(x.split()))

In [207]:
def is_malay(X):
    malay_prefix_tags = ["Jalan", "Lorong", "Bukit", "Lengkok", "Taman", "Kampong", "Lengkong"]
    return np.expand_dims([x in malay_prefix_tags for x in X],axis=1).astype(float)

In [245]:
transformer1 = FunctionTransformer(get_len)
transformer2 = FunctionTransformer(get_num_words)
transformer3 = FunctionTransformer(get_avg_word_len)

vect = CountVectorizer(ngram_range=(1,4), analyzer='char')


name_transform = FeatureUnion([
    ('count', vect),
    ('get_len', transformer1),
    ('get_num_words', transformer2),
    ('get_avg_word_len', transformer3)
])

transformer4 = FunctionTransformer(is_malay)

col_trans = ColumnTransformer([
    ('name_features', name_transform, 'name'),
    ('is_Malay',transformer4, 'tag')
])

In [219]:
col_trans.fit_transform(X).toarray()

array([[ 0.,  0.,  0., ...,  1.,  6.,  0.],
       [ 0.,  0.,  0., ...,  1.,  8.,  0.],
       [ 0.,  0.,  0., ...,  1.,  4.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  1.,  6.,  0.],
       [ 0.,  0.,  0., ...,  1., 11.,  0.],
       [ 0.,  0.,  0., ...,  1.,  5.,  0.]])

In [246]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
clf = LinearSVC()
pipe = make_pipeline(col_trans, clf)

In [212]:
X.head()

Unnamed: 0,name,tag
0,Saiboo,Street
1,Merchant,Loop
2,Hill,Street
3,Ophir,Road
4,Buona,Vista Road


In [251]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [252]:
pipe.fit(X_train,y_train)
y_test_pred = pipe.predict(X_test)



In [253]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(y_test,y_test_pred)

array([[179,   1,  17,   0,   7,   2],
       [  0, 104,   0,   0,   7,   0],
       [ 21,   3,  83,   1,   2,   9],
       [  1,   1,   0,   7,   1,   1],
       [  5,   1,   5,   0, 207,   3],
       [ 10,   1,   2,   0,   4,  24]], dtype=int64)

In [250]:
accuracy_score(y_test,y_test_pred)

0.847672778561354

In [254]:
X_test[y_test!=y_test_pred]

Unnamed: 0,name,tag
1693,Second,Street
1618,Eber,Road
1624,Sinaran,Drive
346,Canal,Road
1104,Pitt,Street
...,...,...
1633,Keris,Drive
617,Malacca,Street
880,Swan Lake,Avenue
2728,Wallace,Way
