In [None]:
# Text Classification
# -------------------
# Definition:
#   Text Classification is the process of labeling or organizing text data into groups.
#   It forms a fundamental part of Natural Language Processing. In the digital age that we
#   live in we are surrounded by text on our social media accounts, in commercials, on websites, Ebooks, etc. 
#   Taken from: https://www.educative.io/edpresso/text-classification-in-nlp

# Text Classification features
# 1. Your model can only have two results: either your model is correct or incorrect
# 2. Raw text is converted into a vectorized version (ML doesn't really understand text).
#    Imagine that a text has a vectorized form (like a matrix). There are many ways of transforming
#    raw text into numerical information. 

# Some Applications of Text Classification:
#   - Spam detection
#   - Bot comments
#   - Topic Labeling
#   - Sentiment Analysis

# Notes:
# - Text classification is a topic of ML
# - ML uses algorithms that iteratively learn from data
# - A text classification system can be built based on Supervised Machine Learning



![](images/supervised_ml.png)
Image taken from: https://towardsdatascience.com/what-is-machine-learning-a-short-note-on-supervised-unsupervised-semi-supervised-and-aed1573ae9bb

In [3]:
# In an ML algorithm, how do we evaluate the performance of a model?
# ----------------------------------------------------------------------
# To do so, we have to know some Classification Metrics:
# - Accuracy: 
#    - Number of correct predictions made by the model divided by the total number of predictions
#    - Accuracy is useful when target classes are well balanced (e.g. there is roughly the same number of positive and negative comments)
            
# - Recall:
#    - Ability of a model to find all the relevant cases within a dataset
#    - The # of true positives divided by the # of true positives plus the # of false negatives.
# - Precision:
#    - Precision is the ratio between the True Positives and all the predicted Positives
#    - The # of true positives divided by the number of true positives plus the # of false positives
# - F1-Score:
#    - Combination (harmonic mean) of Recall & Precision: F1 = 2 * (Precision * Recall) / (Precision + Recall)

# Resources:
# - https://www.analyticsvidhya.com/blog/2020/09/precision-recall-machine-learning/


In [2]:
# Confusion Matrix: A table to describe the performance of a classification model
# ------------------------------------------------------------------------------
# Taking as example the Spam Email Detector:
# - A text message is Spam (True condition)
# - ML model predicted Spam (Predicted condition)

# If our model can only output two possible values (correct or incorrect), a True condition could be either correct or incorrect
# and a predicted condition could be correct or incorrect. In other words, we have 4 separate groups at the end of testing
# Examples:

# A legitimate message was identified as legitimate (Positive Condition, Predicted Positive) -> True Positive
# A spam message was identified as spam (Negative Condition, Predicted Negative) -> True Negative
# A legitimate message was identified as spam (Positive Condition, Predicted Negative) -> False Negative
# A spam message was identified as legitimate (Negative Condition, Predicted Positive) -> False Positive

![](images/confusion_matrix.png)

# Confusion Matrix Example: Diagnostic Tool
![](images/confusion_matrix_example.png)


In [4]:
# Text Feature Extraction
# -----------------------

# - Most classic ML algorithms can't take in raw text
# - We need some kind of feature extraction that allows us to take text and convert it into numerical information

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [None]:
# Locate the dataset relative to the current working directory.
current_path = str(Path('.').absolute())
data_path = '{}{}'.format(current_path, '/data/smsspamcollection.tsv')

# The file is tab-separated, hence sep='\t'.
df = pd.read_csv(data_path, sep='\t')
df.head()

In [None]:
# Count the missing values in every column of the dataset.
df.isna().sum()

In [None]:
# Class balance check: number of rows for each distinct label value.
label_counts = df['label'].value_counts()
label_counts

In [None]:
# Features are the raw message texts; the target is the label column.
X, y = df['message'], df['label']

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Count Vectorization: fit_transform() builds the vocabulary from the
# training messages and converts them to a numeric count matrix in one call.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Equivalent two-step form:
# count_vect.fit(X_train)                         # -> build the vocabulary
# X_train_counts = count_vect.transform(X_train)  # -> text to numeric transformation

In [None]:
# Show the shape of the vectorized matrix, then the shape of the raw
# training messages it was built from.
for shape in (X_train_counts.shape, X_train.shape):
    print(shape)

In [None]:
# Things to be aware of
# - Not all words are equally important
# - Some words do not "weigh" the same as others
# - TF-IDF gives us information about which words are more important than others

# Already imported in the notebook's import cell; repeated so this cell runs on its own.
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the weighting from the training counts, then apply it to those counts.
tdidf_transf = TfidfTransformer()
tdidf_transf.fit(X_train_counts)
X_train_tfidf = tdidf_transf.transform(X_train_counts)
X_train_tfidf.shape

# Alternative that goes straight from raw text to TF-IDF features:
# from sklearn.feature_extraction.text import TfidfVectorizer
# vect = TfidfVectorizer()
# X_train_tfidf = vect.fit_transform(X_train)

In [None]:
# Training a Classifier
# Fit a linear support vector classifier on the TF-IDF training features.

# Already imported in the notebook's import cell; repeated so this cell runs on its own.
from sklearn.svm import LinearSVC

# fit() returns the estimator itself, so construction and training can be chained.
clf = LinearSVC().fit(X_train_tfidf, y_train)
clf

In [None]:
# A sklearn Pipeline removes the repetitive vectorize-then-train steps:
# TF-IDF extraction and classification are performed in a single fit() call.

# Already imported in the notebook's import cell; repeated so this cell runs on its own.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),  # raw text -> TF-IDF features
    ('clf', LinearSVC()),          # TF-IDF features -> predicted label
])
text_clf.fit(X_train, y_train)

In [None]:
# Classify the held-out test messages with the fitted pipeline.
predictions = text_clf.predict(X=X_test)

In [None]:
# Evaluate the pipeline's predictions against the true test labels.
# confusion_matrix, classification_report and accuracy_score come from
# sklearn.metrics and must be imported in the notebook's import cell.

# Table of true labels vs. predicted labels.
print(confusion_matrix(y_test, predictions))

# Per-class precision, recall and F1-score.
print(classification_report(y_test, predictions))

# Fraction of test messages that were classified correctly.
print(accuracy_score(y_test, predictions))

In [None]:
# Try the fitted pipeline on two hand-written messages: a casual one and a
# prize-style one.
# A notebook cell only displays its last expression, so both messages are
# classified in a single predict() call; the result holds one label per
# message, in the same order as the list.
sample_messages = [
    'Heyyyy how areee you? I just wanna get in touch with you for playing a song.',
    'Congratulations! You have been selected as a winner. Please click in the following link and send a text message to 2452435 number',
]
text_clf.predict(sample_messages)