In [13]:
import matplotlib.pyplot as plt  
import numpy as np
import pandas as pd
import re
import seaborn as sns
import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Context
Original Kaggle dataset can be found here: https://www.kaggle.com/datasets/basilb2s/language-detection
Language detection models for Kaggle's language detection dataset, used to predict a language given input text.

Languages include:

- English
- Malayalam
- Hindi
- Tamil
- Kannada
- French
- Spanish
- Portuguese
- Italian
- Russian
- Swedish
- Dutch
- Arabic
- Turkish
- German
- Danish
- Greek

**Goal:** Primarily a playground for learning and exploring different models, testing my own NumPy model implementations, and practicing data processing and analysis as I study ML.

**Status:** Currently includes three models: scikit-learn's logistic regression, a basic TensorFlow multilayer perceptron neural network, and my own softmax regression implementation in NumPy. I plan to explore Multinomial Naive Bayes next.

Note: I accidentally built the bag-of-words (BOW) vocabulary from the entire dataset (train and test), so I need to go back, map out-of-vocabulary words to an UNK token, and apply smoothing.

# Data Exploration

In [17]:
# Load the Kaggle language-detection CSV and drop rows with missing values.
# NOTE(review): this absolute, machine-specific path only resolves on the
# original author's machine; point DATA_PATH at your local copy of the CSV.
DATA_PATH = 'C:/Users/tmmet/Jupyter Notebooks/language-detection/language_detection.csv'

# dropna() returns a new frame, so its result has to be assigned. Calling it
# bare only displayed a cleaned copy and left `data` itself unchanged.
data = pd.read_csv(DATA_PATH).dropna()
data

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
...,...,...
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada


In [18]:
# Number of samples per language label, most frequent first.
language_counts = data['Language'].value_counts()
language_counts

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

In [19]:
# Print the first sample of every language, in order of first appearance.
language_column = data['Language']
for language in language_column.unique():
    first_sample = data.loc[language_column == language].iloc[:1]
    print(first_sample)

                                                Text Language
0   Nature, in the broadest sense, is the natural...  English
                                                   Text   Language
1385  ഭൗതികപ്രപഞ്ചത്തെ മൊത്തത്തിൽ സൂചിപ്പിക്കുന്ന പദ...  Malayalam
                                                   Text Language
1979  विकि-शब्दकोष (एक मुक्त शब्दकोष एवं समानांतर को...    Hindi
                                                   Text Language
2042  விக்கிப்பீடியா (Wikipedia; /ˌwɪkɪˈpiːdiə/ (கேட...    Tamil
                                                   Text    Language
2511  Nature é uma revista científica interdisciplin...  Portugeese
                                                   Text Language
3250  Si vous disposez d'ouvrages ou d'articles de r...   French
                                                   Text Language
4264  Nature (Engels voor 'natuur') is een Brits voo...    Dutch
                                                   Text Language
4810  Nature es una d

# Data Preprocessing

In [20]:
# Cleaning text
# The patterns are raw strings compiled once up front. The previous '\s+'
# literal was a non-raw string containing an invalid escape sequence, which
# newer Python versions warn about.
PUNCT_TO_SPACE = re.compile(r'[\([{})\]!@#$,"%^*?:;~`]')
CHARS_TO_DROP = re.compile(r"\\|[0-9]|/|-|_|'|\.")
WHITESPACE_RUN = re.compile(r'\s+')


def clean_text(txt):
    """Normalizes one text sample.

    Args:
        txt: (str) raw text

    Returns:
        (str) text with bracket/punctuation characters replaced by spaces,
            backslashes, digits, '/', '-', '_', apostrophes and periods removed,
            whitespace runs collapsed to a single space, and lowercased
    """
    txt = PUNCT_TO_SPACE.sub(' ', txt)
    txt = CHARS_TO_DROP.sub('', txt)
    txt = WHITESPACE_RUN.sub(' ', txt)
    return txt.lower()


texts = data['Text']
cleaned = [clean_text(txt) for txt in texts]

data['text_cleaned'] = cleaned

**Note: I accidentally built the bag-of-words (BOW) vocabulary from the entire dataset (train and test), which inflates the accuracies reported for unseen data. I need to go back, map out-of-vocabulary (OOV) words to an UNK token, and apply smoothing.**

In [None]:
# Tokenization & Vectorization
# Build the vocabulary from the cleaned text so the preprocessing step
# actually feeds the models. Previously the raw `texts` were tokenized and
# `data['text_cleaned']` was computed but never used.
# NOTE(review): the vocabulary is still fit on the full dataset (train + test),
# so the <UNK> token is never exercised and test accuracy is optimistic.
corpus = data['text_cleaned']

tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(corpus)  # word -> index
vocabulary = tokenizer.word_index
print(f'Vocabulary, size {len(vocabulary) + 1}: {list(vocabulary.keys())[:50]}...')

vectors = tokenizer.texts_to_matrix(corpus, mode='count')  # bag of words
print(f'\nVector shape: {vectors.shape}')  # data samples x vocab size

In [None]:
Y = data['Language'].to_numpy().reshape(-1, 1)

# The `sparse` argument was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Try the new name first and fall back for older versions.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)
enc.fit(Y)

# Create train/test splits
X_train, X_test, Y_train, Y_test = train_test_split(vectors, Y, test_size=0.2, random_state=42)
X_train, X_test = X_train.T, X_test.T  # (d, n) d features x n samples

# Convert labels to one-hot encodings, transposed to (k, n) to match X's layout
Y_train_hot, Y_test_hot = enc.transform(Y_train).T, enc.transform(Y_test).T

print(f'X_train shape: {X_train.shape}, Y_train_hot shape: {Y_train_hot.shape}; X_test shape: {X_test.shape}, Y_test_hot shape: {Y_test_hot.shape}')

# Scikit Learn Logistic Regression Model

In [None]:
def sklearn_logistic_regression(X, Y, X_eval=None, Y_eval=None):
    """Fits scikit-learn's LogisticRegression and prints train/test accuracy.

    Args:
        X: (n, d) training data, n samples by d features
        Y: (n,) training labels
        X_eval: (m, d) optional held-out data; when omitted, the notebook-level
            `X_test` split is used so existing two-argument calls still work
        Y_eval: (m,) optional held-out labels; when omitted, the notebook-level
            `Y_test` is used

    Returns:
        lr_model: the fitted LogisticRegression model
    """
    lr_model = LogisticRegression()
    lr_model.fit(X, Y)

    if X_eval is None or Y_eval is None:
        # Backward-compatible fallback to the notebook-level held-out split.
        X_eval, Y_eval = X_test.T, Y_test.reshape(-1)

    # Score on the data that was actually passed in rather than on globals.
    print(f'Accuracy on training set: {lr_model.score(X, Y)}')
    print(f'Accuracy on test set: {lr_model.score(X_eval, Y_eval)}')

    return lr_model

# Assigning the result keeps the model available and avoids echoing its repr.
lr_model = sklearn_logistic_regression(X_train.T, Y_train.reshape(-1), X_test.T, Y_test.reshape(-1))

# My Numpy Softmax Regression Model

**Model Implementation**

In [None]:
def linear(X, W, b):
    """Applies the affine map W^T X + b to every sample (column) of X.

    Args:
        X: (d, n) inputs, d features by n data samples
        W: (d, k) weight parameters, d features by k classes
        b: (k,) bias parameters, one per class

    Returns:
        Z: (k, n) class scores, k classes by n data samples
    """
    bias_column = b.reshape(-1, 1)  # (k, 1) so it broadcasts across the n samples
    return W.T @ X + bias_column

In [None]:
def softmax(Z):
    """Computes a numerically stable column-wise softmax.

    Args:
        Z: (k, n) applied linear model for k classes by n data samples

    Returns:
        probabilities: (k, n) estimated probabilities where
            probabilities[j][i] = P(y_i=j|x;w,b), the probability that the
            ith sample belongs to class j; every column sums to 1
    """
    # Subtracting each column's maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (and the ratio from
    # becoming nan) when scores are large.
    shifted = Z - np.max(Z, axis=0)
    exp_scores = np.exp(shifted)
    probabilities = exp_scores / exp_scores.sum(axis=0)

    return probabilities

In [None]:
def compute_cost(H, Y):
    """Computes the total cross-entropy cost over all samples.

    Args:
        H: (k, n) estimated softmax probabilities s.t. H[j][i] = P(y_i=j|x;w,b),
            the likelihood that the ith sample belongs to the jth class
        Y: (k, n) one-hot encoded targets of k classes by n data samples

    Returns:
        cost: (scalar) summed (not averaged) cross-entropy cost
    """
    # Floor the probabilities at the smallest positive normal float so that an
    # entry that underflowed to 0 yields a large finite log instead of -inf;
    # otherwise 0 * log(0) evaluates to nan and poisons the whole cost.
    floor = np.finfo(float).tiny
    log_probs = np.log(np.maximum(H, floor))

    loss = (np.multiply(Y, log_probs)).sum(axis=0)  # (n,) loss for n samples
    cost = -loss.sum()  # (scalar) total cost

    return cost

In [None]:
def gradient_descent(X, Y, H, b):
    """Computes the gradients for softmax regression.

    Despite the name, this only evaluates the gradients; the parameter update
    itself is performed by the caller.

    Args:
        X: (d, n) d features by n data samples
        Y: (k, n) one-hot encoded targets for k classes by n data samples
        H: (k, n) estimated softmax probability hypotheses s.t. H[j][i] = P(y_i=j|x;w,b)
        b: (k,) bias parameters for k classes (unused; kept so existing
            callers keep working)

    Returns:
        dj_dw: (d, k) gradients of the cost w.r.t. parameters W, averaged over n
        dj_db: (k,) gradients of the cost w.r.t. parameter b, averaged over n
    """
    n = X.shape[1]
    scale = -(1 / n)
    residual = Y - H  # (k, n) target minus prediction

    dj_dw = scale * np.dot(X, residual.T)
    dj_db = scale * np.sum(residual, axis=1)

    return dj_dw, dj_db.reshape(-1)

In [None]:
def fit(X, Y, alpha, epochs, k, log_every=100):
    """Trains the softmax regression model with full-batch gradient descent.

    Args:
        X: (d, n) d features by n data samples
        Y: (k, n) one-hot encoded targets for k classes by n data samples
        alpha: (scalar) learning rate for gradient descent
        epochs: (scalar) number of iterations to train
        k: (scalar) number of classes
        log_every: (scalar) print the cost every `log_every` epochs; pass 0 or
            None to silence progress output. Defaults to 100.

    Returns:
        W: (d, k) fitted parameters for d features by k classes
        b: (k,) fitted bias parameter for k classes
    """
    d = X.shape[0]  # num features

    # Parameters start at zero.
    W = np.zeros((d, k))
    b = np.zeros((k,))

    for i in range(epochs):
        Z = linear(X, W, b)
        H = softmax(Z)

        # The cost is only needed for reporting, so it is evaluated on logging
        # epochs only instead of on every iteration.
        if log_every and i % log_every == 0:
            cost = compute_cost(H, Y)
            print(f'epoch: {i}, cost: {cost}')

        dj_dw, dj_db = gradient_descent(X, Y, H, b)

        W = W - alpha * dj_dw
        b = b - alpha * dj_db

    return W, b.reshape(-1)

In [None]:
def predict(X, W, b, enc):
    """Predicts class labels for X with the fitted parameters W, b.

    Args:
        X: (d, n) d features by n data samples
        W: (d, k) parameters for d features by k classes
        b: (k,) bias parameter for k classes
        enc: OneHotEncoder used to encode Y

    Returns:
        Y_hat: predicted labels for the n samples, as decoded by
            `enc.inverse_transform` (one row per sample)
    """
    class_probabilities = softmax(linear(X, W, b))  # (k, n)

    # The encoder works on one row per sample, hence the transpose to (n, k).
    return enc.inverse_transform(class_probabilities.T)

In [None]:
def create_confusion_matrix(Y, Y_hat, title):
    """Plots a confusion-matrix heatmap between true and predicted labels.

    Args:
        Y: true labels; flattened to 1-D before use
        Y_hat: predicted labels; flattened to 1-D before use
        title: (str) title drawn above the heatmap
    """
    Y, Y_hat = Y.reshape(-1), Y_hat.reshape(-1)

    # Take the classes from BOTH the true and the predicted labels and hand the
    # same ordering to confusion_matrix. Using only np.unique(Y) mislabels the
    # axes whenever a class is predicted that never occurs in Y.
    classes = np.unique(np.concatenate([Y, Y_hat]))

    cm = confusion_matrix(Y, Y_hat, labels=classes)

    fig, ax = plt.subplots(figsize=(15, 10))
    sns.heatmap(cm, annot=True, cmap=sns.cubehelix_palette(), xticklabels=classes,
                yticklabels=classes, linewidths=0.005, fmt='g', ax=ax)

    ax.set_xlabel('Predict', fontsize=12, labelpad=15)
    ax.set_ylabel('True', fontsize=12, labelpad=15)
    ax.set_title(title, fontsize=14, pad=20)

In [None]:
# Use this for simple model debugging purposes
def simple():
    """Fits the softmax regression on a tiny hand-made dataset and prints the result.

    Uses plain ndarrays rather than np.matrix, which NumPy no longer
    recommends, so the toy run exercises the same array code paths as the
    real training data.
    """
    X_simple = np.array([[1, 2, 3],
                         [4, 5, 6]])  # 2 features x 3 samples
    Y_simple = np.array([[0, 0, 1],
                         [0, 1, 0],
                         [0, 0, 0],
                         [1, 0, 0]])  # 4 classes x 3 samples, one-hot columns

    k_simple = Y_simple.shape[0]
    W_simple_fit, b_simple_fit = fit(X_simple, Y_simple, 0.01, 10000, k_simple)

    print(f'trained W: {W_simple_fit}')
    print(f'trained b: {b_simple_fit}')

    def predict_simple(X, W, b):
        """Returns the index of the most probable class for every sample."""
        Z = linear(X, W, b)
        H = softmax(Z)

        y_hat = np.argmax(H, axis=0)
        return y_hat

    print(f'predictions: {predict_simple(X_simple, W_simple_fit, b_simple_fit)}')

# simple()

**Training and Testing the Model**

In [None]:
# Hyperparameters and training run for the NumPy softmax regression
alpha, epochs = 0.07, 15000  # learning rate, number of gradient-descent iterations
k = data['Language'].unique().size  # num classes

print(f'Training... alpha={alpha}, epochs={epochs}')

W, b = fit(X_train, Y_train_hot, alpha, epochs, k)

print('\nDone.')

In [None]:
# Evaluate the NumPy softmax regression on the training split
print('Train set performance with Numpy Softmax Regression model:\n')
print(f'X_train shape: {X_train.shape}, Y_train_hot shape: {Y_train_hot.shape}\n')

Y_train_hat = predict(X_train, W, b, enc)

train_report = classification_report(Y_train, Y_train_hat, zero_division=0)
print(train_report)

In [None]:
# Evaluate the NumPy softmax regression on the held-out test split
print('Test set performance with Numpy Softmax Regression model:\n')
print(f'X_test shape: {X_test.shape}, Y_test_hot shape: {Y_test_hot.shape}\n')

Y_test_hat = predict(X_test, W, b, enc)

test_report = classification_report(Y_test, Y_test_hat, zero_division=0)
print(test_report)

In [None]:
# Plot the test-split confusion matrix for the NumPy softmax regression
cm_title = 'Confusion matrix for Softmax Regression Model for Language Detection, Y Test'
create_confusion_matrix(Y_test, Y_test_hat, title=cm_title)

# Tensorflow Neural Network Model

In [None]:
def tensorflow_model(X, Y, k, epochs):
    """Builds, compiles and trains a small fully connected Tensorflow network.

    The network has two ReLU hidden layers (50 and 25 units) and a linear
    output layer of k units. The output layer emits logits, so the loss is
    configured with from_logits=True.

    Args:
        X: Input data
        Y: Target values, one-hot encoded
        k: Number of classes
        epochs: Number of iterations

    Returns:
        tf_model: The Tensorflow NN model
    """
    layer_specs = ((50, 'relu'), (25, 'relu'), (k, 'linear'))

    tf_model = Sequential()
    for units, activation in layer_specs:
        tf_model.add(Dense(units, activation=activation))

    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    optimizer = tf.keras.optimizers.Adam(0.001)
    tf_model.compile(loss=loss_fn, optimizer=optimizer)

    tf_model.fit(X, Y, epochs=epochs)

    return tf_model

In [None]:
def tensorflow_model_predict(X, tf_model):
    """Predicts categorical labels for X with a trained Tensorflow model.

    Relies on the notebook-level `enc` encoder to map the per-class scores
    back to labels.

    Args:
        X: data to predict on
        tf_model: Tensorflow model

    Returns:
        Y_hat: Categorical predictions on X
    """
    logits = tf_model.predict(X)
    probabilities = tf.nn.softmax(logits).numpy()  # logits -> class probabilities

    return enc.inverse_transform(probabilities)

In [None]:
# Fit the Tensorflow network on the training split (samples as rows)
print('Training Tensorflow model...')
num_classes = len(data['Language'].unique())
tf_model = tensorflow_model(X_train.T, Y_train_hot.T, k=num_classes, epochs=10)

In [None]:
# Evaluate the Tensorflow network on the training split
print('Train set performance with Tensorflow model:\n')

Y_train_hat_tf = tensorflow_model_predict(X_train.T, tf_model)
train_report_tf = classification_report(Y_train, Y_train_hat_tf, zero_division=0)
print(train_report_tf)

In [None]:
# Evaluate the Tensorflow network on the held-out test split
print('Test set performance with Tensorflow model:\n')

Y_test_hat_tf = tensorflow_model_predict(X_test.T, tf_model)
test_report_tf = classification_report(Y_test, Y_test_hat_tf, zero_division=0)
print(test_report_tf)