#### 1. Loading required Libraries

In [115]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

#### 2. Importing dataset into dataframe

In [117]:
# Train Data

train = pd.read_excel("../datasets/training_set.xlsx")
train.head()

Unnamed: 0,title,cleaned text,relevance
0,Blackstone to buy EagleClaw Midstream for abou...,"EagleClaw Midstream Ventures LLC, the largest ...",0
1,Worldwide Mobile Crusher and Screener Industry...,WireThe report has been added to offering. Acc...,0
2,"In a First, BP Offers Employees Shares in Rall...",Oil major BP is launching its first share awar...,1
3,SHAREHOLDER ALERT: Purcell Julie & Lefkowitz L...,TipRanks We’ve got a full month of 2021 behind...,0
4,Komatsu Australia - Komatsu Australia,Komatsu Australia Corporate Finance Pty Ltd Cr...,0


In [118]:
# Test Data

test = pd.read_excel("../datasets/testing_set.xlsx")
test.head()

Unnamed: 0,title,cleaned text,relevance
0,As Blackjewel bankruptcy case enters final str...,On June 11 the court ruled it would lift the a...,0
1,FG Wilson (Engineering) Ltd v John Holt & Comp...,1. This is the hearing of an application by th...,0
2,Personnel Policy and Performance Appraisal Sys...,Here at Komatsu we consider many aspects of ou...,0
3,Caterpillar planning 700 job cuts in the North,US manufacturing firm Caterpillar has announce...,1
4,Hardman & Co Research : Tritax EuroBox present...,Hardman & Co Research 03-Feb-2021 / 12:45 GMT/...,0


#### 3. Functions

In [119]:
import spacy

# Load the 'en_core_web_sm' spaCy pipeline once at module level.
# spacy_process() below reads this shared `nlp` object both to parse text
# and to look up stop-word flags in its vocabulary.
nlp = spacy.load('en_core_web_sm')

# Text preprocessing with various combinations

def spacy_process(text, remove_stopwords, remove_punctuation):
    """Lemmatize ``text`` with spaCy and optionally drop stop words and punctuation.

    Parameters
    ----------
    text : str
        Text to process; it is passed to the module-level ``nlp`` pipeline.
    remove_stopwords : bool
        When truthy, drop every lemma whose vocabulary entry
        (``nlp.vocab[lemma]``) is flagged ``is_stop``.
    remove_punctuation : bool
        When truthy, drop every lemma that is exactly one of the characters
        ``? : ! . , ; $ ' - _``.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces.
    """
    lemmas = [token.lemma_ for token in nlp(text)]

    if remove_stopwords:
        # The stop-word flag is looked up on the lemma string, not on the
        # original token.
        lemmas = [lemma for lemma in lemmas if not nlp.vocab[lemma].is_stop]

    if remove_punctuation:
        # Build a new list instead of calling list.remove() inside a loop over
        # the same list: removing while iterating skips the element that
        # follows each removal, so runs such as "? !" were only half stripped.
        # A set of characters is used (rather than a substring test against a
        # string) so only whole single-character punctuation lemmas match.
        punctuation = frozenset("?:!.,;$'-_")
        lemmas = [lemma for lemma in lemmas if lemma not in punctuation]

    return " ".join(lemmas)

In [120]:
# Converting Text to lowercase

def lower_text(text):
    """Return ``text`` with surrounding whitespace stripped and all letters lower-cased."""
    trimmed = text.strip()
    return trimmed.lower()

In [121]:
# Creating a Dataframe with word-vectors in TF-IDF form and Target values

def final_df(df, to_lower, remove_stopwords, remove_punctuation, is_train, cv):
    """Turn the text column of ``df`` into TF-IDF features and append the other columns.

    The first column of ``df`` is treated as the text column. Every column
    after it (the target, in the two-column layout this notebook uses) is
    appended unchanged to the right of the feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds the text. It is not modified.
    to_lower : bool
        When truthy, run ``lower_text`` on each text before further processing.
    remove_stopwords, remove_punctuation : bool
        Forwarded to ``spacy_process`` for each text.
    is_train : bool
        When truthy the vectorizer is fitted on the texts
        (``cv.fit_transform``); otherwise the already-fitted vectorizer is
        only applied (``cv.transform``).
    cv : vectorizer
        Object providing ``fit_transform``, ``transform`` and
        ``get_feature_names_out``; their results must support ``toarray()``.

    Returns
    -------
    pandas.DataFrame
        One column per vectorizer feature followed by the non-text columns
        of ``df``, indexed like ``df``.
    """
    # Work on a derived Series so the caller's frame is left untouched;
    # overwriting and dropping the column in place made the function
    # non-idempotent (a second call on the same frame saw no text column).
    texts = df.iloc[:, 0]

    # Converting text to lowercase
    if to_lower:
        texts = texts.apply(lower_text)

    # Text preprocessing with the requested stop-word / punctuation removal
    texts = texts.apply(spacy_process, args=(remove_stopwords, remove_punctuation))

    # TF-IDF form: fit only on training data, reuse the fitted vocabulary otherwise
    matrix = cv.fit_transform(texts) if is_train else cv.transform(texts)

    # Build the feature frame on the caller's index. pd.concat(axis=1) aligns
    # rows by index label, so a default RangeIndex here would produce
    # NaN-padded, misaligned rows for any input not indexed 0..n-1.
    features = pd.DataFrame(
        matrix.toarray(),
        columns=cv.get_feature_names_out(),
        index=df.index,
    )

    # Everything except the text column (the target values)
    remainder = df.iloc[:, 1:]

    # Returning TF-IDF form with target
    return pd.concat([features, remainder], axis=1)

In [122]:
# Trains the model with one preprocessing combination and returns y_test and y_pred

def train_model(model, train, test, to_lower, remove_stopwords, remove_punctuation, cv):
    """Preprocess both datasets, fit ``model`` on the training part and predict the test part.

    Parameters
    ----------
    model : estimator
        Object with ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)``.
    train, test : pandas.DataFrame
        Frames handed to ``final_df``; the last column of its result is
        used as the target and all preceding columns as features.
        Neither frame is modified.
    to_lower, remove_stopwords, remove_punctuation : bool
        Preprocessing switches forwarded to ``final_df``.
    cv : vectorizer
        Forwarded to ``final_df``; fitted on ``train`` and reused for ``test``.

    Returns
    -------
    tuple
        ``(y_test, y_pred)``: the true test targets and the model's predictions.
    """
    # Pass copies so the caller's frames are never modified, regardless of
    # whether the preprocessing step works in place. This keeps the cell
    # safe to re-run with the same inputs.
    train_features = final_df(
        train.copy(), to_lower, remove_stopwords, remove_punctuation, True, cv
    )

    # Splitting training dataset: last column is the target
    X_train = train_features.iloc[:, :-1]
    y_train = train_features.iloc[:, -1]

    # Test preprocessing reuses the vectorizer fitted on the training data
    test_features = final_df(
        test.copy(), to_lower, remove_stopwords, remove_punctuation, False, cv
    )

    # Splitting testing dataset
    X_test = test_features.iloc[:, :-1]
    y_test = test_features.iloc[:, -1]

    # Fitting the model
    fitted = model.fit(X_train, y_train)

    # Calculating y_pred
    y_pred = fitted.predict(X_test)

    return y_test, y_pred

#### 4. Training and Evaluation

##### Before Training ReadME

In function **train_model**,

The train and test datasets must each consist of exactly two columns, in this order:
1. Text data
2. Target values

Otherwise you will get an error :p

In [132]:
# Combination 1
#
# Model                 - Logistic Regression
# Max Features          - 500
# Mono-Gram             - Yes
# Lowercase             - True
# Removed Stopwords     - Yes
# Removed Punctuation   - Yes

cv = TfidfVectorizer(max_features=500, ngram_range=(1, 1))
model = LogisticRegression()

# Skip the first column of each sheet (the saved row index) so that only the
# remaining columns are handed to train_model.
y_test, y_pred = train_model(
    model,
    train.iloc[:, 1:],
    test.iloc[:, 1:],
    to_lower=True,
    remove_stopwords=True,
    remove_punctuation=True,
    cv=cv,
)

print("Classification Report")
print(classification_report(y_test, y_pred))

score = accuracy_score(y_test, y_pred)
print("Accuracy Score : {:.2f} %".format(score*100))

Classification Report
              precision    recall  f1-score   support

           0       0.86      0.97      0.91       168
           1       0.91      0.67      0.77        79

    accuracy                           0.87       247
   macro avg       0.89      0.82      0.84       247
weighted avg       0.88      0.87      0.87       247

Accuracy Score : 87.45 %


In [124]:
# Combination 2
#
# Model                 - Random Forest
# Max Features          - 500
# Mono-Gram             - Yes
# Lowercase             - True
# Removed Stopwords     - Yes
# Removed Punctuation   - Yes

# A fixed random_state makes the forest (bootstrap samples and feature
# subsets) identical on every run, so the reported metrics are reproducible
# after Restart & Run All. Without it each run gives slightly different scores.
model = RandomForestClassifier(
    n_jobs=3,
    oob_score=True,
    n_estimators=100,
    criterion="gini",
    random_state=42,
)
cv = TfidfVectorizer(ngram_range=(1, 1), max_features=500)

# Skip the first column of each sheet (the saved row index) so that only the
# remaining columns are handed to train_model.
y_test, y_pred = train_model(
    model,
    train.iloc[:, 1:],
    test.iloc[:, 1:],
    to_lower=True,
    remove_stopwords=True,
    remove_punctuation=True,
    cv=cv,
)

score = accuracy_score(y_test, y_pred)

print("Classification Report")
print(classification_report(y_test, y_pred))

print("Accuracy Score : {:.2f} %".format(score*100))

Classification Report
              precision    recall  f1-score   support

           0       0.87      0.98      0.92       168
           1       0.93      0.68      0.79        79

    accuracy                           0.88       247
   macro avg       0.90      0.83      0.85       247
weighted avg       0.89      0.88      0.88       247

Accuracy Score : 88.26 %


#### 5. References

https://docs.google.com/document/d/1GtPdkkekYPDRHfDmqaPDNA0euV1LT63qXtjm6EcPvIk/edit?usp=sharing

https://www.analyticsvidhya.com/blog/2017/06/word-embeddings-count-word2veec/

https://towardsdatascience.com/complete-machine-learning-pipeline-for-nlp-tasks-f39f8b395c0d

https://www.analyticsvidhya.com/blog/2022/06/an-end-to-end-guide-on-nlp-pipeline/

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

https://saurabhy27.medium.com/tf-idf-monogram-bi-gram-and-tri-gram-and-python-implementation-f343385b62a4

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

https://www.geeksforgeeks.org/python-lemmatization-with-nltk/

https://www.geeksforgeeks.org/python-program-to-convert-a-list-to-string/

https://stackoverflow.com/questions/12182744/python-pandas-apply-a-function-with-arguments-to-a-series

https://medium.com/mlearning-ai/nlp-03-lemmatization-and-stemming-using-spacy-b2829becceca

https://sparkbyexamples.com/pandas/pandas-drop-multiple-columns-by-index/

https://stackoverflow.com/questions/28103992/tfidf-vectorizer-giving-error

https://github.com/scikit-learn/scikit-learn/issues/19953

https://www.digitalocean.com/community/tutorials/pandas-dataframe-apply-examples

https://www.geeksforgeeks.org/part-speech-tagging-stop-words-using-nltk-python/

https://towardsdatascience.com/text-normalization-with-spacy-and-nltk-1302ff430119

https://github.com/manfye/spacy-nltk-text-normalization/blob/main/spacy-vs-nltk-text-normalization.ipynb

https://spacy.io/usage/processing-pipelines