In [15]:
import pandas as pd
import numpy as np

# Problem 2: a linguistic street map of Singapore

There is considerable linguistic variety in Singapore road names: Malaysians names (Jalan Besar), British names (Northumberland road), Chinese names (Keong Saik Road), Indian names (Veerasamy road), Jewish names (Belilios road), and the usual generic sorts of names that describe either area landmarks or other common noun.

![Image](streets.png)

In [3]:
# load the dataset
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/streets.csv'
street_names = pd.read_csv(url)
street_names.head(10)

Unnamed: 0,name,tag,origin
0,Saiboo,Street,Indian
1,Merchant,Loop,Generic
2,Hill,Street,Generic
3,Ophir,Road,Malay
4,Buona,Vista Road,Other
5,Sengkang,Avenue,Chinese
6,Ang Mo Kio,Avenue,Chinese
7,Brickland,Road,British
8,Choa Chu Kang,Road,Chinese
9,Woodlands,Avenue,Generic


**``Origin`` column values:**

- Chinese (all dialects including Cantonese, Hokkien, Mandarin, etc)
- Malay
- Indian (all languages of the subcontinent)
- British
- Generic (Race Course Road, Sunrise Place, etc)
- Other (Other languages).

In [5]:
street_names.origin.value_counts()

Malay      927
British    798
Generic    497
Chinese    403
Other      167
Indian      42
Name: origin, dtype: int64

The **goal** is to predict the ``origin`` column from the ``street`` and ``tag`` columns.

**Part 1:** Add the following features:

- Number of words in road name (more words => more likely to be Chinese.) Assign it to a column named `n_words`.
- Average word length in road name (longer words => more likely to be British or Indian.) Assign it to a column named `avg_word_len`.
- Is the road tag Malay? (if yes => very correlated with being Malay.) **Malay tags**: *Jalan*, *Lorong*, *Bukit*, *Lengkok*, *Taman*, *Kampong*, *Lengkong*. Assign it to a column named `is_malay`.

In [4]:
def n_words_name(name):
    return len(name.split(' '))

In [11]:
street_names['n_words'] = street_names.name.apply(n_words_name)

In [14]:
def avg_word_length(name):
    words = name.split(' ')
    n_words = len(words)
    lengths = np.array([len(word) for word in words])
    return np.mean(lengths)

In [17]:
street_names['avg_len'] = street_names.name.apply(avg_word_length)

In [22]:
def is_malay(tag):
    return tag in  ['Jalan', 'Lorong', 'Bukit', 'Lengkok', 'Taman', 'Kampong', 'Lengkong']

In [25]:
street_names['is_malay'] = street_names.tag.apply(is_malay).astype(int)

In [29]:
street_names

Unnamed: 0,name,tag,origin,n_words,avg_len,is_malay
0,Saiboo,Street,Indian,1,6.0,0
1,Merchant,Loop,Generic,1,8.0,0
2,Hill,Street,Generic,1,4.0,0
3,Ophir,Road,Malay,1,5.0,0
4,Buona,Vista Road,Other,1,5.0,0
...,...,...,...,...,...,...
2829,Florence,Close,Chinese,1,8.0,0
2830,Hoot Kiam,Road,Chinese,2,4.0,0
2831,Clarke,Quay,British,1,6.0,0
2832,Countryside,Walk,Generic,1,11.0,0


**Part 2:** define the matrix X (columns `name`, `n_words`, `avg_word_len`, `is_malay`) and the target vector y (column `origin`)

In [58]:
X = street_names.drop(['tag','origin'],axis=1)
y = street_names.origin

**Part 3:** split X and y into training and testing sets.

In [59]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y)

**Part 4:** build a classification pipeline.

The pipeline must use `CountVectorizer` to compute the number of **character 1-gram, 2-grams, 3-grams and 4-grams** from the column `name` ([CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer) has an `analyzer` parameter that you'll have to set to `analyzer='char'`.)

In [60]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [62]:
name_processor = ColumnTransformer(transformers=[
    ('vect',CountVectorizer(analyzer='char',ngram_range=(1,4)),'name')
],remainder='passthrough')

pipe = Pipeline(steps=[
    ('name_processor',name_processor),
    ('clf', LogisticRegression(max_iter=1000))
])
pipe

Pipeline(steps=[('name_processor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('vect',
                                                  CountVectorizer(analyzer='char',
                                                                  ngram_range=(1,
                                                                               4)),
                                                  'name')])),
                ('clf', LogisticRegression(max_iter=1000))])

**Part 5:** Fit the classification pipeline to the training data, and  evaluate its performance on the testing set.

In [63]:
pipe.fit(X,y)

Pipeline(steps=[('name_processor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('vect',
                                                  CountVectorizer(analyzer='char',
                                                                  ngram_range=(1,
                                                                               4)),
                                                  'name')])),
                ('clf', LogisticRegression(max_iter=1000))])

In [64]:
y_pred = pipe.predict(X)

In [65]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [66]:
confusion_matrix(y,y_pred)

array([[796,   0,   1,   0,   1,   0],
       [  0, 403,   0,   0,   0,   0],
       [  0,   0, 497,   0,   0,   0],
       [  0,   0,   0,  41,   1,   0],
       [  0,   0,   0,   0, 927,   0],
       [  0,   0,   0,   0,   2, 165]], dtype=int64)

In [67]:
accuracy_score(y,y_pred)

0.9982357092448836