# Hackathon

## 1. Imports:

In [28]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# Libraries for data preparation and model building
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, confusion_matrix  # Classification metrics
from sklearn.model_selection import train_test_split, GridSearchCV  # Train-test split and grid search
from sklearn.linear_model import LogisticRegression  # Logistic Regression classifier for machine learning
from sklearn.tree import DecisionTreeClassifier  # Decision Tree classifier for machine learning
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier  # Random Forest classifier for machine learning
from sklearn.svm import LinearSVC, SVC  # Support Vector Machine classifiers
from sklearn.naive_bayes import GaussianNB, MultinomialNB  # Naive Bayes classifiers
from sklearn.ensemble import BaggingClassifier  # Bagging classifier
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees classifier
from sklearn.ensemble import VotingClassifier  # Voting classifier
from sklearn.neighbors import KNeighborsClassifier
from scipy.sparse import hstack  # Used for stacking sparse matrices horizontally
import pickle  # Serialization library
from sklearn.utils import resample  # Resampling tool
from sklearn import feature_selection  # Feature selection module
from sklearn.feature_selection import f_classif  # Feature selection using F-statistic
from sklearn import preprocessing  # Data preprocessing
import pickle  # Serialization library

from scipy.sparse import issparse
from sklearn import metrics

# Feature selection Libraries:
from sklearn.feature_selection import SelectKBest  # To reduce features
from sklearn.feature_selection import chi2  # Used to estimate which features are most impactful

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Flags for notebook Execution
VECTORIZER_TO_USE = "count"  # Chooses between TfIDF vectorizer or Count Vectorizer - accepted values are "tfidf" or "count"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tune_bdx\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tune_bdx\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tune_bdx\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 2. Load DataFrames:

In [29]:
# Read training data from CSV file into a DataFrame
df_train = pd.read_csv('train_set.csv')

# Read test data without labels from CSV file into a DataFrame
df_test = pd.read_csv('test_set.csv')

# 3. Cleaning Data:

In [30]:
df_train.shape

(33000, 2)

In [31]:
df_test.shape

(5682, 2)

In [32]:

def nlp_preprocessing(texts):
    # Convert to lowercase
    texts = [text.lower() for text in texts]

    # Remove numbers
    texts = [re.sub(r'\d+', '', text) for text in texts]

    # Remove punctuation
    texts = [text.translate(str.maketrans("", "", string.punctuation)) for text in texts]

    # Tokenize the text
    words = [word_tokenize(text) for text in texts]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [[word for word in doc if word not in stop_words] for doc in words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [[lemmatizer.lemmatize(word) for word in doc] for doc in words]

    # Join the cleaned words back into a single string
    cleaned_texts = [' '.join(doc) for doc in words]

    return cleaned_texts

In [33]:
df_train['text'] = nlp_preprocessing(df_train['text'])

In [34]:
df_train

Unnamed: 0,lang_id,text
0,xho,umgaqosiseko wenza amalungiselelo kumaziko axh...
1,xho,idha iya kuba nobulumko bokubeka umsebenzi nap...
2,eng,province kwazulunatal department transport inv...
3,nso,netefatša gore ba file dilo ka moka tše le dum...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...
32996,sot,modise mosadi na ntse sa utlwe hore thabang ra...
32997,eng,closing date submission completed tender augus...
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...


In [35]:
df_test['text'] = nlp_preprocessing(df_test['text'])

In [36]:
df_test

Unnamed: 0,index,text
0,1,mmasepala fa maemo kgethegileng letlelela kgat...
1,2,uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,tshivhumbeo tshi fana na ngano dza vhathu
3,4,kube inja nelikati betingevakala kutsi titsini...
4,5,winste op buitelandse valuta
...,...,...
5677,5678,mark ballot private
5678,5679,ge ka kgetha ka bowena go se šomiše mofani ka ...
5679,5680,e ka kopo etsa kgetho ya hao ka hloko hobane h...
5680,5681,tb ke bokudi ba pmb mme morero tla lefella tlh...


In [37]:
# Assuming df_train is your DataFrame and 'lang_id' is the column you want to encode
le = LabelEncoder()
df_train['lang_id_encoded'] = le.fit_transform(df_train['lang_id'])

# Display the mapping between original categories and encoded values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:", label_mapping)

Label Mapping: {'afr': 0, 'eng': 1, 'nbl': 2, 'nso': 3, 'sot': 4, 'ssw': 5, 'tsn': 6, 'tso': 7, 'ven': 8, 'xho': 9, 'zul': 10}


In [38]:
df_train

Unnamed: 0,lang_id,text,lang_id_encoded
0,xho,umgaqosiseko wenza amalungiselelo kumaziko axh...,9
1,xho,idha iya kuba nobulumko bokubeka umsebenzi nap...,9
2,eng,province kwazulunatal department transport inv...,1
3,nso,netefatša gore ba file dilo ka moka tše le dum...,3
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,8
...,...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...,6
32996,sot,modise mosadi na ntse sa utlwe hore thabang ra...,4
32997,eng,closing date submission completed tender augus...,1
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...,9


In [39]:
# Extract index that will be used when submitting predictions based on the evaluate dataset
df_index_submission = pd.DataFrame(df_train['lang_id'])
df_index_submission

Unnamed: 0,lang_id
0,xho
1,xho
2,eng
3,nso
4,ven
...,...
32995,tsn
32996,sot
32997,eng
32998,xho


In [40]:
# Dropping the message column since it has been vectorized
df_train = df_train.drop("lang_id", axis='columns')
df_train.head()

Unnamed: 0,text,lang_id_encoded
0,umgaqosiseko wenza amalungiselelo kumaziko axh...,9
1,idha iya kuba nobulumko bokubeka umsebenzi nap...,9
2,province kwazulunatal department transport inv...,1
3,netefatša gore ba file dilo ka moka tše le dum...,3
4,khomishini ya ndinganyiso ya mbeu yo ewa maana...,8


In [41]:
# Initialize and fit specified Vectorizer
if VECTORIZER_TO_USE == "tfidf":

    # Initialize Vectorizer
    tfid_train = TfidfVectorizer(ngram_range=(4,5), analyzer='char', min_df=1, max_df =0.45, max_features=50000) # Change max features to include more data

    # Fit vectoriser on text data:
    vec_text_train = tfid_train.fit_transform(df_train["text"])

    # Transform both test set for 'tfid'
    vec_text_test = tfid_train.transform(df_test["text"])
    
elif VECTORIZER_TO_USE == "count":

    # Initialize Vectorizer
    count_vec_train = CountVectorizer(ngram_range=(4,5), analyzer='char', min_df=1, max_df =0.45, max_features=50000) # Change max features to include more data

    # Fit vectoriser on text data:
    vec_text_train = count_vec_train.fit_transform(df_train["text"])

    # Transform both test set for 'count'
    vec_text_test = count_vec_train.transform(df_test["text"])

In [42]:
# Converting vectorized text into sparse dataframe
if VECTORIZER_TO_USE == "tfidf":
    sparse_vec_msg_train = pd.DataFrame.sparse.from_spmatrix(vec_text_train, columns = tfid_train.get_feature_names_out())
elif VECTORIZER_TO_USE == "count":
    sparse_vec_msg_train = pd.DataFrame.sparse.from_spmatrix(vec_text_train, columns = count_vec_train.get_feature_names_out())
sparse_vec_msg_train.head()

Unnamed: 0,aan,aan.1,aand,aang,aans,aant,aanv,aba,aba.1,abab,...,ṱo ḓ,ṱo ḓa,ṱuku,ṱuku.1,ṱun,ṱun ḓ,ṱuwe,ṱuwed,ṱuṱu,ṱuṱuw
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# Converting vectorized text into sparse dataframe
if VECTORIZER_TO_USE == "tfidf":
    sparse_vec_msg_test = pd.DataFrame.sparse.from_spmatrix(vec_text_test, columns = tfid_train.get_feature_names_out())
elif VECTORIZER_TO_USE == "count":
    sparse_vec_msg_test = pd.DataFrame.sparse.from_spmatrix(vec_text_test, columns = count_vec_train.get_feature_names_out())
sparse_vec_msg_test.head()

Unnamed: 0,aan,aan.1,aand,aang,aans,aant,aanv,aba,aba.1,abab,...,ṱo ḓ,ṱo ḓa,ṱuku,ṱuku.1,ṱun,ṱun ḓ,ṱuwe,ṱuwed,ṱuṱu,ṱuṱuw
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
# Adding the dataframes together
df_vectorized_clean = pd.concat([df_train.reset_index(drop=True), sparse_vec_msg_train.reset_index(drop=True)], axis=1)
df_vectorized_clean.head()

Unnamed: 0,text,lang_id_encoded,aan,aan.1,aand,aang,aans,aant,aanv,aba,...,ṱo ḓ,ṱo ḓa,ṱuku,ṱuku.1,ṱun,ṱun ḓ,ṱuwe,ṱuwed,ṱuṱu,ṱuṱuw
0,umgaqosiseko wenza amalungiselelo kumaziko axh...,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,idha iya kuba nobulumko bokubeka umsebenzi nap...,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,province kwazulunatal department transport inv...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,netefatša gore ba file dilo ka moka tše le dum...,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,khomishini ya ndinganyiso ya mbeu yo ewa maana...,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
# Adding the dataframes together
df_vectorized_test_clean = pd.concat([df_test.reset_index(drop=True), sparse_vec_msg_test.reset_index(drop=True)], axis=1)
df_vectorized_test_clean.head()

Unnamed: 0,index,text,aan,aan.1,aand,aang,aans,aant,aanv,aba,...,ṱo ḓ,ṱo ḓa,ṱuku,ṱuku.1,ṱun,ṱun ḓ,ṱuwe,ṱuwed,ṱuṱu,ṱuṱuw
0,1,mmasepala fa maemo kgethegileng letlelela kgat...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,uzakwaziswa ngokufaneleko nakungafuneka eminye...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,tshivhumbeo tshi fana na ngano dza vhathu,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,kube inja nelikati betingevakala kutsi titsini...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,winste op buitelandse valuta,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Dropping the message column since it has been vectorized
df_vectorized_clean = df_vectorized_clean.drop("text", axis='columns')
df_vectorized_clean.head()

Unnamed: 0,lang_id_encoded,aan,aan.1,aand,aang,aans,aant,aanv,aba,aba.1,...,ṱo ḓ,ṱo ḓa,ṱuku,ṱuku.1,ṱun,ṱun ḓ,ṱuwe,ṱuwed,ṱuṱu,ṱuṱuw
0,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
# Dropping the message column since it has been vectorized
df_vectorized_test_clean = df_vectorized_test_clean.drop("text", axis='columns')
df_vectorized_test_clean.head()

Unnamed: 0,index,aan,aan.1,aand,aang,aans,aant,aanv,aba,aba.1,...,ṱo ḓ,ṱo ḓa,ṱuku,ṱuku.1,ṱun,ṱun ḓ,ṱuwe,ṱuwed,ṱuṱu,ṱuṱuw
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# Extract index that will be used when submitting predictions based on the evaluate dataset
df_index_submission = pd.DataFrame(df_vectorized_test_clean['index'])
df_index_submission

Unnamed: 0,index
0,1
1,2
2,3
3,4
4,5
...,...
5677,5678
5678,5679
5679,5680
5680,5681


In [49]:
# Remove index form both training and test datasets
df_vectorized_test_clean = df_vectorized_test_clean.drop("index", axis=1)

df_vectorized_test_clean.head()

Unnamed: 0,aan,aan.1,aand,aang,aans,aant,aanv,aba,aba.1,abab,...,ṱo ḓ,ṱo ḓa,ṱuku,ṱuku.1,ṱun,ṱun ḓ,ṱuwe,ṱuwed,ṱuṱu,ṱuṱuw
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 4. Pre-processing:

In [50]:
# Extracting the response variable from the training dataset
y_train_og = df_vectorized_clean[:len(df_vectorized_clean)][['lang_id_encoded']]

# Separating predictor variables from response variable
x_train = df_vectorized_clean[:len(df_vectorized_clean)].drop("lang_id_encoded", axis = "columns")

In [51]:
# Splitting into our training and validation data subsets
X_train, X_validate, y_train, y_validate = train_test_split(x_train, y_train_og, test_size=0.2, random_state=42, stratify=y_train_og)

In [62]:
# Instantiate the Multinomial Naive Bayes model
mnb_model = MultinomialNB()

# Fit the model on the training data
mnb_model.fit(X_train, y_train)

# Predict on the validation data
y_pred_validate = mnb_model.predict(X_validate)

# Evaluate the MNB model on the validation data
accuracy_mnb = accuracy_score(y_validate, y_pred_validate)
classification_rep_mnb = classification_report(y_validate, y_pred_validate)

# Print the results for the MNB model
print("Multinomial Naive Bayes Model:")
print(f"Accuracy: {accuracy_mnb}")
print("\nClassification Report:\n", classification_rep_mnb)


Multinomial Naive Bayes Model:
Accuracy: 0.9996969696969698

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       600
           1       1.00      1.00      1.00       600
           2       1.00      1.00      1.00       600
           3       1.00      1.00      1.00       600
           4       1.00      1.00      1.00       600
           5       1.00      1.00      1.00       600
           6       1.00      1.00      1.00       600
           7       1.00      1.00      1.00       600
           8       1.00      1.00      1.00       600
           9       1.00      1.00      1.00       600
          10       1.00      1.00      1.00       600

    accuracy                           1.00      6600
   macro avg       1.00      1.00      1.00      6600
weighted avg       1.00      1.00      1.00      6600



In [71]:
# Assuming df_vectorized_test_clean is your test dataset
# Extracting the predictor variables
X_test = df_vectorized_test_clean.copy()  # Create a copy to avoid modifying the original DataFrame

# Check if 'lang_id_encoded' column exists in X_test
#if 'lang_id_encoded' not in X_test.columns:
#raise KeyError("Column 'lang_id_encoded' not found in X_test.")

# Drop 'lang_id_encoded' if it exists
X_test = X_test.drop("lang_id_encoded", axis="columns", errors="ignore")

# Check if 'index' column exists in X_test
#if 'index' not in X_test.columns:
#raise KeyError("Column 'index' not found in X_test.")

# Assuming df_vectorized_test_clean has an 'index' column
# Create a DataFrame with 'index' and 'lang_id_encoded' columns
# Match submissions to index
submission_mnb = pd.concat([df_vectorized_clean.reset_index(drop=True), X_test.reset_index(drop=True)], axis=1)

# Save the DataFrame to a CSV file
submission_mnb.to_csv('submission_mnb.csv', index=False)

# 5. Models:

In [65]:
# Assuming df_vectorized_test_clean is your test dataset
# Extracting the predictor variables
X_test = df_vectorized_test_clean.drop("lang_id_encoded", axis="columns")

# Check if 'lang_id_encoded' column exists in df_vectorized_test_clean
if 'lang_id_encoded' not in df_vectorized_test_clean.columns:
    raise KeyError("Column 'lang_id_encoded' not found in df_vectorized_test_clean.")

# Reshape X_test if needed
# X_test = X_test.values.reshape(-1, 1)

# Predict on the test data
y_pred_test = mnb_model.predict(X_test)

# Assuming df_vectorized_test_clean has an 'index' column
# Create a DataFrame with 'index' and 'lang_id_encoded' columns
submission_mnb = pd.DataFrame({'index': df_vectorized_test_clean['index'], 'lang_id_encoded': y_pred_test})

# Save the DataFrame to a CSV file
submission_mnb.to_csv('submission_mnb.csv', index=False)

KeyError: "['lang_id_encoded'] not found in axis"

In [64]:










submission.to_csv('Predictions_o.csv', index=False)

submission

ValueError: DataFrame constructor not properly called!