In [3]:
import pandas as pd
import numpy as np

# Problem 2: a linguistic street map of Singapore

There is considerable linguistic variety in Singapore road names: Malay names (Jalan Besar), British names (Northumberland Road), Chinese names (Keong Saik Road), Indian names (Veerasamy Road), Jewish names (Belilios Road), and the usual generic sorts of names that describe either area landmarks or other common nouns.

![Image](streets.png)

In [22]:
# Fetch the Singapore street-name dataset from the course repository
# and preview its first ten rows.
url = (
    'https://raw.githubusercontent.com/um-perez-alvaro/'
    'Data-Science-Practice/master/Data/streets.csv'
)
street_names = pd.read_csv(filepath_or_buffer=url)
street_names.head(n=10)

Unnamed: 0,name,tag,origin
0,Saiboo,Street,Indian
1,Merchant,Loop,Generic
2,Hill,Street,Generic
3,Ophir,Road,Malay
4,Buona,Vista Road,Other
5,Sengkang,Avenue,Chinese
6,Ang Mo Kio,Avenue,Chinese
7,Brickland,Road,British
8,Choa Chu Kang,Road,Chinese
9,Woodlands,Avenue,Generic


**``Origin`` column values:**

- Chinese (all dialects including Cantonese, Hokkien, Mandarin, etc)
- Malay
- Indian (all languages of the subcontinent)
- British
- Generic (Race Course Road, Sunrise Place, etc)
- Other (Other languages).

In [23]:
# number of words
name = 'Hoot Kiam Plume'  # sample road name kept for ad-hoc experiments; the function below uses its own argument
def get_number_of_words(name):
    """Return the number of words in a road name.

    Words are separated by any run of whitespace, so repeated, leading or
    trailing spaces do not create phantom empty "words" (which
    ``name.split(' ')`` would count).

    Parameters
    ----------
    name : str
        Road name, e.g. ``'Ang Mo Kio'``.

    Returns
    -------
    int
        Number of whitespace-separated words; 0 for an empty or blank name.
    """
    return len(name.split())

# avg word length
def get_avg_word_length(name):
    """Return the mean length (in characters) of the words in a road name.

    Words are separated by any run of whitespace, so stray extra spaces do
    not add zero-length words that would pull the average down.

    Parameters
    ----------
    name : str
        Road name, e.g. ``'Hoot Kiam'``.

    Returns
    -------
    float
        Average word length; ``0.0`` when the name contains no words.
    """
    words = name.split()
    if not words:
        # Avoid np.mean([]) which yields nan and emits a RuntimeWarning.
        return 0.0
    return np.mean([len(word) for word in words])

# Malay road tags used to flag streets whose tag is Malay
_MALAY_TAGS = ('Jalan', 'Lorong', 'Bukit', 'Lengkok', 'Taman', 'Kampong', 'Lengkong')

def is_Malay(tag):
    """Return True when ``tag`` is exactly one of the known Malay road tags.

    The comparison is an exact, case-sensitive match against the fixed tag
    list; anything else (including multi-word tags) yields False.
    """
    return tag in _MALAY_TAGS

# Quick sanity check on a tag that is known to be Malay.
maylayTag = 'Jalan'
is_Malay(maylayTag)



True

In [24]:
# Engineered features: two computed from the road name, one from the road tag.
# Bracket indexing is used for the columns instead of attribute access.
street_names['number_of_words'] = street_names['name'].apply(get_number_of_words)
street_names['avg_word_length'] = street_names['name'].apply(get_avg_word_length)
street_names['is_malay'] = street_names['tag'].apply(is_Malay)

In [25]:
# Spot-check the first row to confirm the new feature columns were added.
street_names.head(n=1)

Unnamed: 0,name,tag,origin,number_of_words,avg_word_length,is_malay
0,Saiboo,Street,Indian,1,6.0,False


In [26]:
# Target vector: the origin label. Feature matrix: every column except the
# raw tag and the label itself.
y = street_names['origin']
X = street_names.drop(columns=['tag', 'origin'])


In [27]:
# Column groups: engineered numeric/boolean features and the raw text column.
num_features = [
    'number_of_words',
    'avg_word_length',
    'is_malay',
]
text_features = [
    'name',
]

In [28]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [11]:
# Character-level bag of n-grams. The range is 1-4 to match the assignment
# (Part 4) and the vectorizer used inside the classification pipeline, so the
# vocabulary explored here is the one the model actually sees.
vect = CountVectorizer(analyzer='char', ngram_range=(1, 4))

In [12]:
# Learn the n-gram vocabulary from the road names and show the count matrix.
vect.fit_transform(X['name'])

<2834x8262 sparse matrix of type '<class 'numpy.int64'>'
	with 64485 stored elements in Compressed Sparse Row format>

In [13]:
# Inspect the character n-grams that make up the fitted vocabulary.
vect.get_feature_names_out()

array([' ', ' a', ' ab', ..., 'zubi', 'zuk', 'zuki'], dtype=object)

In [14]:
# Class balance of the target: how many streets fall under each origin.
street_names['origin'].value_counts()

Malay      927
British    798
Generic    497
Chinese    403
Other      167
Indian      42
Name: origin, dtype: int64

The **goal** is to predict the ``origin`` column from the ``name`` and ``tag`` columns.

**Part 1:** Add the following features:

- Number of words in road name (more words => more likely to be Chinese.) Assign it to a column named `n_words`.
- Average word length in road name (longer words => more likely to be British or Indian.) Assign it to a column named `avg_word_len`.
- Is the road tag Malay? (if yes => very correlated with being Malay.) **Malay tags**: *Jalan*, *Lorong*, *Bukit*, *Lengkok*, *Taman*, *Kampong*, *Lengkong*. Assign it to a column named `is_malay`.

**Part 2:** define the matrix X (columns `name`, `n_words`, `avg_word_len`, `is_malay`) and the target vector y (column `origin`)

In [15]:
# Display the frame with the engineered feature columns (pandas truncates the middle rows).
street_names

Unnamed: 0,name,tag,origin,number_of_words,avg_word_length,is_malay
0,Saiboo,Street,Indian,1,6.0,False
1,Merchant,Loop,Generic,1,8.0,False
2,Hill,Street,Generic,1,4.0,False
3,Ophir,Road,Malay,1,5.0,False
4,Buona,Vista Road,Other,1,5.0,False
...,...,...,...,...,...,...
2829,Florence,Close,Chinese,1,8.0,False
2830,Hoot Kiam,Road,Chinese,2,4.0,False
2831,Clarke,Quay,British,1,6.0,False
2832,Countryside,Walk,Generic,1,11.0,False


In [16]:
# Part 2: target vector is the origin label; the feature matrix keeps the
# road name and the engineered features, dropping the raw tag and the label.
y = street_names['origin']
X = street_names.drop(columns=['tag', 'origin'])

In [17]:
# Preview the first three rows of the feature matrix.
X.head(n=3)

Unnamed: 0,name,number_of_words,avg_word_length,is_malay
0,Saiboo,1,6.0,False
1,Merchant,1,8.0,False
2,Hill,1,4.0,False


**Part 3:** split X and y into training and testing sets.

In [18]:
from sklearn.model_selection import train_test_split
# Hold out a test set.
# - random_state pins the shuffle so results are reproducible across runs.
# - stratify=y keeps the origin proportions equal in both splits, which
#   matters because the classes are heavily imbalanced (Indian has only 42 rows).
# NOTE(review): test_size is left at the scikit-learn default (0.25), but the
# shapes recorded further down show a 50/50 split - confirm the intended size.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    random_state=42,
)

**Part 4:** build a classification pipeline.

The pipeline must use `CountVectorizer` to compute the number of **character 1-gram, 2-grams, 3-grams and 4-grams** from the column `name` ([CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer) has an `analyzer` parameter that you'll have to set to `analyzer='char'`.)

In [19]:
# Sanity-check the sizes of the four pieces produced by the split.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(1417, 4) (1417, 4) (1417,) (1417,)


In [42]:
# Preview the first three rows of the shuffled training features.
X_train.head(n=3)

Unnamed: 0,name,number_of_words,avg_word_length,is_malay
427,Boundary,1,8.0,False
2591,Denham,1,6.0,False
2045,Pari Unak,2,4.0,True


In [76]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PolynomialFeatures, LabelEncoder


num_features = ['number_of_words', 'avg_word_length', 'is_malay']
cat_features = ['name']

# Numeric branch: fill any missing values with the column median, then
# rescale every feature to the [0, 1] range.
num_processor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# Every ColumnTransformer entry must be a 3-tuple (name, transformer, columns);
# a 2-tuple raises "not enough values to unpack (expected 3, got 2)".
feature_processor = ColumnTransformer(transformers=[
    # CountVectorizer needs a 1-D sequence of strings, so the column is given
    # as the scalar 'name' rather than the list ['name'].
    ('text_processor', CountVectorizer(analyzer='char', ngram_range=(1, 4)), 'name'),
    # Route the engineered numeric/boolean features through num_processor
    # (previously it was defined but never used).
    ('num_processor', num_processor, num_features),
],
# Any column not listed above is still forwarded unchanged, as before.
remainder='passthrough'
)



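### Warning: I had trouble setting up the pipe above and understanding how to use `num_processor`.
Error encountered: `not enough values to unpack (expected 3, got 2)`. Each entry of `ColumnTransformer(transformers=[...])` must be a 3-tuple `(name, transformer, columns)`, so a 2-tuple such as `('encoder', LabelEncoder())` triggers this error.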

In [78]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# Full classification pipeline: feature preprocessing followed by a random
# forest. The step names are part of the interface - the grid search below
# addresses the forest as 'forest__max_depth'.
pipe = Pipeline(steps=[
    ('processor', feature_processor),
    # random_state pins the forest's bootstrap/feature sampling so that fits
    # (and any hyper-parameter search cloning this pipe) are reproducible.
    ('forest', RandomForestClassifier(max_depth=5, random_state=42)),
])


pipe.fit(X_train, y_train)

**Part 5:** Fit the classification pipeline to the training data, and  evaluate its performance on the testing set.

In [85]:
from sklearn.model_selection import GridSearchCV
# Candidate maximum tree depths for the pipeline's 'forest' step.
depth_list = [2, 4, 8, 16, 32, 64]
param_dic = dict(forest__max_depth=depth_list)

# 5-fold cross-validated search scored on accuracy; failed fits raise
# instead of being silently scored.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_dic,
    cv=5,
    scoring='accuracy',
    error_score='raise',
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [86]:
from sklearn.metrics import confusion_matrix,accuracy_score

# Take the best pipeline found by the grid search, predict the held-out
# test set, and show the confusion matrix of true vs. predicted origins.
best_pipe = grid.best_estimator_
y_test_pred = best_pipe.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[368,   2,   7,   0,  22,   0],
       [  7, 193,   0,   0,   9,   0],
       [ 82,   3, 132,   0,  20,   0],
       [  8,   0,   0,   5,  10,   0],
       [ 15,   1,   3,   0, 446,   0],
       [ 27,   1,   2,   0,  25,  29]], dtype=int64)

In [89]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Overall accuracy of the best pipeline on the held-out test set.
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.8278052223006351