## **Text Feature Engineering and Modelling**
Done by Wong Wen Bing 230436M

In [2]:
# pip install scikit-learn seaborn 

In [3]:
import nltk
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [4]:
df=pd.read_csv('230436M_cleaned_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,description,species,text_length,cleaned_text,chunked
0,0,2 Jun 2023 ï¿½ The Javan myna shares some simi...,Javan Myna,162,jun javan myna share similarity common myna te...,jun javan_myna share similarity common myna te...
1,1,Click here for more information about the Red ...,Collared Kingfisher,398,click information red list category criterion ...,click information red list category criterion ...
2,2,The black-headed oriole ( Oriolus larvatus) is...,Black-naped Oriole,349,black headed oriole oriolus larvatus specie bi...,black_headed_oriole oriolus larvatus specie bi...
3,3,"Search from thousands of royalty-free ""Javan M...",Javan Myna,177,search thousand royalty free javan myna stock ...,search thousand royalty free javan_myna stock ...
4,4,521 foreground recordings and 156 background ...,Little Egret,112,foreground recording background recording egre...,foreground recording background recording egre...


#### **Feature Engineering**   
There are different types of features that can be done, we will be using a mix of vectorizers as comparison. A base model will be trained to examine this.




In [6]:
X = df['cleaned_text']
Y=df['species']

Setting test size and training size

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test=train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(stop_words='english', ngram_range=(1,2))
X_train_cv=cv.fit_transform(X_train)
X_test_cv=cv.transform(X_test) #make sure X_test is TRANSFORM and not fit_transform

#print dimensions and features 
print(f"""
training: {X_train_cv.toarray().shape}
test: {X_test_cv.toarray().shape}

features: {cv.get_feature_names_out()}
      """)


training: (488, 9684)
test: (123, 9684)

features: ['aa' 'aa sitesettingsid' 'ability' ... 'zimbabwe zambia' 'zoonosis'
 'zoonosis singapore']
      


In [10]:
import pickle
with open('Count Vectorizer.pkl', 'wb') as f1: 
    pickle.dump(cv, f1)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv=TfidfVectorizer(stop_words='english', ngram_range=(1,2))
X_train_tv=tv.fit_transform(X_train)
X_test_tv=tv.transform(X_test) #make sure X_test is TRANSFORM and not fit_transform

#print dimensions and features 
print(f"""
training: {X_train_tv.toarray().shape}
test: {X_test_tv.toarray().shape}

features: {tv.get_feature_names_out()}
      """)


training: (488, 9684)
test: (123, 9684)

features: ['aa' 'aa sitesettingsid' 'ability' ... 'zimbabwe zambia' 'zoonosis'
 'zoonosis singapore']
      


##### **Usage of A Base Model for testing**
We will use a base model, Logistic Regression to decide which of the vectorizers or bag of words features can produce the most results. T

In [13]:
from sklearn.linear_model import LogisticRegression


#train
def vectorise_data(X_train_cv,X_test_cv):
    lr=LogisticRegression(solver='lbfgs')
    lr.fit(X_train_cv, y_train)
    y_pred_cv=lr.predict(X_test_cv)
    accuracy=accuracy_score(y_test, y_pred_cv)
    return accuracy

print(f'''
Model: Count Vectorizer
Accuracy: {vectorise_data(X_train_cv, X_test_cv)} 
=================================================================================================

Model: Tfidf Vectorizer
Accuracy: {vectorise_data(X_train_tv, X_test_cv)}
''')



Model: Count Vectorizer
Accuracy: 0.9349593495934959 

Model: Tfidf Vectorizer
Accuracy: 0.9349593495934959



#### **Modelling**
There will be a number of models tested.

1. Logistic Regression
2. Naives Bayes
3. Support Vector Machine

In [15]:
#importing different models 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

#accuracy metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


#list of models
models=[
    {'name': 'Logistic Regression', 
     'model': LogisticRegression(solver='lbfgs')}, 
    {'name': 'Naive Bayes (Multinomial)', 
     'model': MultinomialNB()}, 
    {'name': 'Naive Bayes (GaussianNB)', 
     'model': GaussianNB()}, 
]
results=[]

#printing confusion matrix    
def conf_matrix(y_test, pred_test):    
    # Creating a confusion matrix
    con_mat = confusion_matrix(y_test, pred_test)
    con_mat = pd.DataFrame(con_mat, range(4), range(4))
    #Ploting the confusion matrix
    plt.figure(figsize=(6,6))
    sns.set(font_scale=1.5) 
    sns.heatmap(con_mat, annot=True, annot_kws={"size": 16}, fmt='g', cmap='Blues', cbar=False)
    

In [16]:
#run models and print classificaton report
from sklearn.metrics import accuracy_score
for modell in models:
    model=modell['model']
    model.fit(X_train_cv, y_train)
    y_pred_cv=model.predict(X_test_cv)
    accuracy=accuracy_score(y_test, y_pred_cv)
    # confusionmatrix = conf_matrix(y_test, y_pred_cv)
    print(f'''
    Model Name: {modell['name']}
    Accuracy: {accuracy:.2f}
    Confusion Matrix: 
    ======================================================================================================
    ''')
    


    Model Name: Logistic Regression
    Accuracy: 0.93
    Confusion Matrix: 
    

    Model Name: Naive Bayes (Multinomial)
    Accuracy: 0.93
    Confusion Matrix: 
    


TypeError: Sparse data was passed for X, but dense data is required. Use '.toarray()' to convert to a dense numpy array.

### **Model 1: Logistic Regression**

In [61]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
lr=OneVsRestClassifier(LogisticRegression(solver='lbfgs'))

#train
lr.fit(X_train_cv, y_train)

#apply to test
y_pred_cv=lr.predict(X_test_cv)
y_pred_cv

array(['Collared Kingfisher', 'Javan Myna', 'Little Egret', 'Javan Myna',
       'Black-naped Oriole', 'Javan Myna', 'Collared Kingfisher',
       'Javan Myna', 'Javan Myna', 'Black-naped Oriole',
       'Black-naped Oriole', 'Black-naped Oriole', 'Collared Kingfisher',
       'Black-naped Oriole', 'Javan Myna', 'Javan Myna',
       'Collared Kingfisher', 'Black-naped Oriole', 'Little Egret',
       'Javan Myna', 'Little Egret', 'Little Egret', 'Little Egret',
       'Javan Myna', 'Collared Kingfisher', 'Collared Kingfisher',
       'Collared Kingfisher', 'Black-naped Oriole', 'Collared Kingfisher',
       'Black-naped Oriole', 'Collared Kingfisher', 'Javan Myna',
       'Little Egret', 'Little Egret', 'Black-naped Oriole',
       'Little Egret', 'Little Egret', 'Black-naped Oriole',
       'Black-naped Oriole', 'Black-naped Oriole', 'Javan Myna',
       'Little Egret', 'Little Egret', 'Black-naped Oriole',
       'Black-naped Oriole', 'Little Egret', 'Little Egret', 'Javan Myna',
    

In [63]:
from sklearn.preprocessing import LabelBinarizer

label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape  # (n_samples, n_classes)

(123, 4)

In [65]:
import matplotlib.pyplot as plt

from sklearn.metrics import RocCurveDisplay
class_of_interest="Little Egret"
class_id = np.flatnonzero(label_binarizer.classes_ == class_of_interest)[0]

display = RocCurveDisplay.from_predictions(
    y_onehot_test[:, class_id],
    y_pred_cv[:, class_id],
    name=f"{class_of_interest} vs the rest",
    color="darkorange",
    plot_chance_level=True,
)
_ = display.ax_.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title=f"One-vs-Rest ROC curves:\{class_of_interest} vs (all)",
)

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [67]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test, y_pred_cv)
cm

array([[36,  0,  0,  3],
       [ 1, 26,  0,  1],
       [ 0,  0, 30,  2],
       [ 1,  0,  0, 23]])

In [75]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_cv, target_names=['Black-naped Oriole','Collared Kingfisher','Javan Myna','Little Egret']))

                     precision    recall  f1-score   support

 Black-naped Oriole       0.95      0.92      0.94        39
Collared Kingfisher       1.00      0.93      0.96        28
         Javan Myna       1.00      0.94      0.97        32
       Little Egret       0.79      0.96      0.87        24

           accuracy                           0.93       123
          macro avg       0.94      0.94      0.93       123
       weighted avg       0.94      0.93      0.94       123



Saving the model

In [71]:
import pickle
with open(f'Logistic Regression.pkl', 'wb') as f1: 
    pickle.dump(lr, f1)

### **Model 2: Naives Bayes**

In [None]:
#MAIN DIFFERENCE IN CODE
from sklearn.naive_bayes import MultinomialNB

nb=MultinomialNB()

#train
nb.fit(X_train_cv, y_train)

#move to X_test
y_pred_cv=nb.predict(X_test_cv)
# y_pred_cv

In [None]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test, y_pred_cv)
cm

In [None]:
# Function to create a confusion matrix 
def conf_matrix(y_test, pred_test):    
    
    # Creating a confusion matrix
    con_mat = confusion_matrix(y_test, pred_test)
    con_mat = pd.DataFrame(con_mat, range(4), range(4))
   
    #Ploting the confusion matrix
    plt.figure(figsize=(6,6))
    sns.set(font_scale=1.5) 
    sns.heatmap(con_mat, annot=True, annot_kws={"size": 16}, fmt='g', cmap='Blues', cbar=False)
    
#Ploting the confusion matrix
conf_matrix(y_test, y_pred_cv)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_cv, target_names=['Black-naped Oriole','Collared Kingfisher','Javan Myna','Little Egret']))

Support Vector Machine


In [None]:
from sklearn import svm

supportvector=svm.LinearSVC()
#train
supportvector.fit(X_train_cv, y_train)

#move to X_test
y_pred_cv=supportvector.predict(X_test_cv)

In [None]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test, y_pred_cv)
cm

In [None]:
from sklearn.metrics import classification_report
svm_results = classification_report(y_test, y_pred_cv, target_names=['Black-naped Oriole','Collared Kingfisher','Javan Myna','Little Egret'])