In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# Load the corpus. The path is relative, so "dataset.csv" has to be in the
# notebook's working directory (otherwise pandas raises FileNotFoundError).
df = pd.read_csv("dataset.csv")

# Jupyter only renders the value of the LAST expression in a cell, and this
# cell ends with assignments, so every summary is emitted explicitly.
# `display` is the rich-output helper the IPython/Jupyter kernel provides.
display(df.head())
print(df.shape)
df.info()  # info() prints its own report, no wrapper needed
display(df.describe())
display(df.isnull().sum())  # check if there are missing values in data

# Raw inputs and targets used by the cells below.
x = df["Text"]
y = df["language"]

FileNotFoundError: [Errno 2] No such file or directory: 'dataset.csv'

In [None]:
# Class-balance check: how many texts do we have per language?
# Labels and bar heights are both taken from the same value_counts() result.
# Pairing unique() (order of first appearance) with value_counts() (sorted by
# count) can attach counts to the wrong language, and the two also differ in
# length when the column contains NaN.
language_counts = df["language"].value_counts()

fig, ax = plt.subplots(figsize=(25, 5))
ax.bar(language_counts.index, language_counts.values, color='pink', width=0.2)
ax.set_xlabel("Language")
ax.set_ylabel("Amount of text in that language")
plt.show()
# We check how many texts we have in each language to see whether the data set is balanced.
# In the original run it was: every language reportedly had 1000 texts.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Seed for the train/test split so that scores and tuned hyperparameters are
# reproducible across "Restart & Run All".
RANDOM_STATE = 42

# Bag-of-words features for the whole corpus.
# NOTE(review): the vocabulary is learned from all texts, including the ones
# that end up in the test split; fit on the training texts only if a strictly
# leakage-free evaluation is required.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(x)
print(X.shape)

# 80/20 split; stratify=y keeps the per-language proportions identical in both
# parts (it requires at least two samples per language).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)

# One all-zero feature row with the vocabulary's width; it is handed to
# predict() as its `array` argument further down.
x_rows, x_cols = X.shape
empty_features = np.zeros((1, x_cols))

In [None]:
# Helper for predicting the label of a raw text with a fitted classifier.
def predict(clf, text, vectorizer, array=None):
    """Predict the label of a single raw text.

    The text is converted to features with ``vectorizer.transform``, i.e. with
    the vectorizer's own tokenisation and vocabulary, so the features match the
    ones the classifier was trained on. The previous implementation counted
    vocabulary entries as *substrings* of the text and wrote the counts into a
    caller-owned buffer that was never reset, so counts leaked from one call
    into the next.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict``.
    text : str
        The raw text to classify.
    vectorizer : object
        Fitted vectorizer exposing ``transform`` (it is called with a
        one-element list containing ``text``).
    array : optional
        Ignored. Kept only so existing calls that pass a feature buffer keep
        working; it is neither read nor modified.

    Returns
    -------
    Whatever ``clf.predict`` returns for the one-row feature matrix.
    """
    features = vectorizer.transform([text])
    return clf.predict(features)

In [None]:
#MultinomialNB classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning: choose the smoothing parameter alpha by 10-fold
# cross-validated accuracy on the training split.
alpha_candidates = [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000]

clf = MultinomialNB()
param = dict(alpha=alpha_candidates)
grid = GridSearchCV(
    estimator=clf,
    param_grid=param,
    scoring='accuracy',
    cv=10,
    return_train_score=True,
)
# The value returned by fit() is kept under the name the later cells read.
grid_search = grid.fit(X_train, y_train)
print(grid_search.best_params_)

In [None]:
# Fit the model with the best parameter found by the grid search above.
# The value is read from the search result instead of being hard-coded, so it
# cannot drift from what the search actually selected.
# NOTE: `grid_search` is reassigned by the KNN search further down, so this
# cell has to run before that one (which is the top-to-bottom order).
best_alpha = grid_search.best_params_["alpha"]
clf = MultinomialNB(alpha=best_alpha)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

# Predict any text: just assign your input to the `text` variable.
text = ""
print(predict(clf, text, vectorizer, empty_features))

In [None]:
#OvR ROC Curve for MultinomialNB

from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import RocCurveDisplay

# One-vs-Rest ROC curves for the MultinomialNB model.

# Indicator columns for the true test labels, one per class seen in y_train.
# (assumes more than two classes, so that every class gets its own column)
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)

# `clf` was already fitted on the training split in the cell above, so it is
# only queried here, not refitted.
y_score = clf.predict_proba(X_test)

# Iterate over the classes the binarizer knows instead of y.unique(): a label
# that is missing from y_train would otherwise raise IndexError on the lookup.
for class_id, class_of_interest in enumerate(label_binarizer.classes_):
    # Look the score column up by label rather than assuming both orders agree.
    score_column = np.flatnonzero(clf.classes_ == class_of_interest)[0]

    fig, ax = plt.subplots()
    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        y_score[:, score_column],
        name=f"{class_of_interest} vs the rest",
        color="darkorange",
        ax=ax,
    )

    ax.plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")
    ax.axis("square")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    # f-string so that non-string class labels do not break the title
    ax.set_title(f"MultinomialNB\nOne-vs-Rest ROC curves:\n{class_of_interest} vs other languages")
    ax.legend()
    plt.show()
    plt.close(fig)  # one figure per class: release it once it has been shown

In [None]:
#KNN classifier
from sklearn.neighbors import KNeighborsClassifier

# Hyperparameter tuning: find the best k and distance metric with GridSearchCV.
knn = KNeighborsClassifier()
k_range = list(range(1, 21))
# "cityblock" and "manhattan" are two names for the same L1 distance, so only
# one of them is searched; listing both re-ran a quarter of the grid for
# identical results.
param_grid = dict(n_neighbors=k_range, metric=["cosine", "euclidean", "manhattan"])

grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy', return_train_score=False, verbose=1)
grid_search = grid.fit(X_train, y_train)
print(grid_search.best_params_)

In [None]:
# Rebuild KNN with the hyperparameters selected by the grid search, fit it on
# the training split and report its accuracy on the held-out test split.
best_knn_params = grid_search.best_params_
knn = KNeighborsClassifier(
    n_neighbors=best_knn_params["n_neighbors"],
    metric=best_knn_params["metric"],
)
knn.fit(X_train, y_train)
knn_test_accuracy = knn.score(X_test, y_test)
print(knn_test_accuracy)

In [None]:
#OvR ROC Curve for K-nearest neighbor

from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import RocCurveDisplay

# One-vs-Rest ROC curves for the K-nearest-neighbor model.

# Indicator columns for the true test labels, one per class seen in y_train.
# (assumes more than two classes, so that every class gets its own column)
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)

# `knn` was already fitted on the training split in the cell above, so it is
# only queried here, not refitted.
y_score = knn.predict_proba(X_test)

# Iterate over the classes the binarizer knows instead of y.unique(): a label
# that is missing from y_train would otherwise raise IndexError on the lookup.
for class_id, class_of_interest in enumerate(label_binarizer.classes_):
    # Look the score column up by label rather than assuming both orders agree.
    score_column = np.flatnonzero(knn.classes_ == class_of_interest)[0]

    fig, ax = plt.subplots()
    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        y_score[:, score_column],
        name=f"{class_of_interest} vs the rest",
        color="darkorange",
        ax=ax,
    )

    ax.plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")
    ax.axis("square")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    # f-string so that non-string class labels do not break the title
    ax.set_title(f"K-nearest neighbor\nOne-vs-Rest ROC curves:\n{class_of_interest} vs other languages")
    ax.legend()
    plt.show()
    plt.close(fig)  # one figure per class: release it once it has been shown