### 1. SMS Data Exploration

The SMS data is available as a tab-separated file along with the class material. In the code below, we read it from Google Drive.

In [None]:
from google.colab import drive
drive.mount('/gdrive')

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the tab-separated SMS file into pandas using an absolute path under the
# /gdrive mount point. Please change the path as needed.
# header=None: the first line is treated as data, so column names are supplied via `names`.
sms_df = pd.read_table('/gdrive/My Drive/ML Content/Statistical NLP/Notebooks/data/sms.tsv', header=None, names=['label', 'message'])

In [None]:
#Total number of SMS
sms_df.shape

In [None]:
#Check the contents of dataframe
sms_df.sample(n=15)

In [None]:
sms_df.loc[0, 'message']

In [None]:
#Spam vs ham
sms_df.groupby('label').count()

Null accuracy: the accuracy obtained by always predicting the most frequent class (ham).

In [None]:
# Null accuracy = share of the most frequent class, i.e. the accuracy of a model
# that always predicts that class. Computed from the data instead of hard-coded
# counts so it stays correct if the dataset changes.
sms_df['label'].value_counts(normalize=True).max()

In [None]:
#Check out a randomly chosen SMS message which is legitimate - ham
# Filter on the label first so the printed message is guaranteed to be ham;
# drawing a random row from the whole frame could just as well return spam.
ham_row = sms_df[sms_df['label'] == 'ham'].sample(n=1).iloc[0]
print(ham_row['label'], ':', ham_row['message'])

In [None]:
#Check out an SMS message which is SPAM
# Select by label rather than by a hard-coded row number, which is only spam
# for one particular ordering of the file.
spam_row = sms_df[sms_df['label'] == 'spam'].iloc[0]
print(spam_row['label'], ':', spam_row['message'])

In [None]:
#Checkout missing values
sms_df.isnull().sum()

In [None]:
# convert label to a numerical variable (ham -> 0, spam -> 1)
sms_df['label_num'] = sms_df.label.map({'ham':0, 'spam':1})
# map() turns any label missing from the dict into NaN; fail loudly here
# instead of silently training on NaN targets later.
assert sms_df['label_num'].notna().all(), "unexpected value in sms_df.label"

In [None]:
#We should have label_num column in dataframe
sms_df.sample(n=15)

### 2. Create Training & Test Dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# split X and y into training and testing sets
sms_train, sms_test, y_train, y_test = train_test_split(sms_df.message,
                                                        sms_df.label_num,
                                                        random_state=2)

In [None]:
# Renumber each split from 0 and discard the shuffled original index, so that
# positional lookups such as sms_train[0] refer to the first training message.
sms_train = sms_train.reset_index(drop=True)
sms_test = sms_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [None]:
#Training data: number of messages and number of labels (should match)
print(sms_train.shape)
print(y_train.shape)

In [None]:
#Test Data
print(sms_test.shape)
print(y_test.shape)

### 3. Tokenization & Vectorization

Use **CountVectorizer** to turn the text messages into numeric features.

In [None]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cvect = CountVectorizer()

In [None]:
sms_train.head()

In [None]:
#Feed SMS data to CountVectorizer
cvect.fit(sms_train)

In [None]:
#Check the vocabulary size
len(cvect.vocabulary_)

In [None]:
#What is there in the vocabulary
# Show only the first 20 entries; displaying the whole dict floods the output.
dict(list(cvect.vocabulary_.items())[:20])

Build Document-term Matrix (DTM)

In [None]:
#Convert Training SMS messages into Count Vectors
X_train_ct = cvect.transform(sms_train)

In [None]:
#Size of Document Term Matrix
X_train_ct.shape

In [None]:
sms_train[0]

In [None]:
#Let's check the first record
X_train_ct[0]

In [None]:
#What's there in sparse matrix
print(X_train_ct[0:1])

In [None]:
#X_train_ct[0:1].todense()

From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction):

> As most documents will typically use a very small subset of the words used in the corpus, the resulting matrix will have **many feature values that are zeros** (typically more than 99% of them).

> For instance, a collection of 10,000 short text documents (such as emails) will use a vocabulary with a size in the order of 100,000 unique words in total while each document will use 100 to 1000 unique words individually.

> In order to be able to **store such a matrix in memory** but also to **speed up operations**, implementations will typically use a **sparse representation** such as the implementations available in the `scipy.sparse` package.

Convert the test SMS messages into numerical features as well

In [None]:
X_test_ct = cvect.transform(sms_test)

In [None]:
X_test_ct.shape

### 4. Building an SMS Classifier

Let's first try the K-Nearest Neighbours algorithm

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# instantiate the model (with the default parameters)
knn = KNeighborsClassifier()

# fit the model with data (occurs in-place)
knn.fit(X_train_ct, y_train)

Evaluation on Test Dataset

In [None]:
from sklearn import metrics

In [None]:
#Calculate accuracy on Test Dataset
metrics.accuracy_score(y_test, knn.predict(X_test_ct))

In [None]:
#Calculate accuracy on Training Dataset
metrics.accuracy_score(y_train, knn.predict(X_train_ct))

We can build a classifier using other algorithms, e.g. SVM

In [None]:
from sklearn.svm import SVC

In [None]:
#Train an SVM with default parameters
svc = SVC()
svc.fit(X_train_ct, y_train)

In [None]:
#Calculate accuracy on Test Dataset
metrics.accuracy_score(y_test, svc.predict(X_test_ct))

In [None]:
#Calculate accuracy on Train Dataset
metrics.accuracy_score(y_train, svc.predict(X_train_ct))

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fix the seed so the reported accuracies are reproducible across runs
# (same seed value as the train/test split).
rf_model = RandomForestClassifier(random_state=2)
rf_model.fit(X_train_ct, y_train)

In [None]:
#Calculate accuracy on Test Dataset
metrics.accuracy_score(y_test, rf_model.predict(X_test_ct))

In [None]:
#Calculate accuracy on Train Dataset
metrics.accuracy_score(y_train, rf_model.predict(X_train_ct))

### 5. Using TF-IDF Vectorizer

In [None]:
# import and instantiate TF-IDF Vectorizer (with the default parameters)
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tvect = TfidfVectorizer()

In [None]:
#Feed training SMS data to TfidfVectorizer
tvect.fit(sms_train)

#Check the vocabulary size
len(tvect.vocabulary_)

In [None]:
#Convert Training SMS messages into numerical values
X_train_tfidf = tvect.transform(sms_train)

X_train_tfidf.shape

In [None]:
#Check first example
print(X_train_tfidf[0])

In [None]:
#Convert Test SMSes also to tf-idf vectors
X_test_tfidf = tvect.transform(sms_test)

Build a Random Forest

In [None]:
# Fix the seed so the reported accuracy is reproducible across runs
# (same seed value as the train/test split).
rf_model_tf = RandomForestClassifier(random_state=2)
rf_model_tf.fit(X_train_tfidf, y_train)

In [None]:
#Calculate accuracy on Test Dataset
metrics.accuracy_score(y_test, rf_model_tf.predict(X_test_tfidf))

### 6. TF-IDF with ngram

In [None]:
#Use ngrams of length upto 2 words
tvect_ngram = TfidfVectorizer(ngram_range=(1,2)) #Tokens can be made of 1 word or 2 words

In [None]:
#sms_df.head()

In [None]:
#Feed only the training SMS data to TfidfVectorizer.
# Fitting on sms_df.message would also learn vocabulary and IDF weights from
# the test messages, leaking test information into the features.
tvect_ngram.fit(sms_train)

#Check the vocabulary size
len(tvect_ngram.vocabulary_)

The movie was awesome

Words as tokens = "The", "movie", "was", "awesome"

ngrams (1,2) -> "The", "movie", "was", "awesome", "The movie", "movie was", "was awesome"

In [None]:
# Peek at the n-gram vocabulary; only the first 20 entries are shown because
# displaying the whole dict floods the output.
dict(list(tvect_ngram.vocabulary_.items())[:20])

In [None]:
#Convert Training SMS messages into numerical values
X_train_tfidf_ngram = tvect_ngram.transform(sms_train)

X_train_tfidf_ngram.shape

In [None]:
# Fix the seed so the reported accuracies are reproducible across runs
# (same seed value as the train/test split).
rf_model_tf_ngram = RandomForestClassifier(random_state=2)
rf_model_tf_ngram.fit(X_train_tfidf_ngram, y_train)

In [None]:
#Calculate accuracy on Test Dataset
metrics.accuracy_score(y_test, rf_model_tf_ngram.predict(tvect_ngram.transform(sms_test)))

In [None]:
#Calculate accuracy on Train Dataset
metrics.accuracy_score(y_train, rf_model_tf_ngram.predict(X_train_tfidf_ngram))

**Summary:**

- `vect.fit(train)` **learns the vocabulary** of the training data
- `vect.transform(train)` uses the **fitted vocabulary** to build a document-term matrix from the training data
- `vect.transform(test)` uses the **fitted vocabulary** to build a document-term matrix from the testing data and **ignores tokens** it hasn't seen before

### 7. Building a Deep Learning Model

In [None]:
import tensorflow as tf

We will use CountVectorizer features in this case. This can be replaced by TF-IDF features

In [None]:
#Start building a Keras Sequential Model
tf.keras.backend.clear_session()
model = tf.keras.Sequential()

In [None]:
#Add hidden layers
# The model is trained on the CountVectorizer matrix, so take the input width
# from that matrix instead of from the unrelated TF-IDF vectorizer.
model.add(tf.keras.layers.Dense(100, activation='relu', input_shape=(X_train_ct.shape[1],)))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))

#Add Output layer: a single sigmoid unit giving the spam probability
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
#Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
X_train_ct.shape

In [None]:
# Keras needs dense input. toarray() yields a plain numpy ndarray, whereas
# todense() returns the legacy np.matrix type.
model.fit(X_train_ct.toarray(), y_train,
           validation_data=(X_test_ct.toarray(), y_test),
           epochs=10, batch_size=32)

In [None]:
# Densify only the first row; converting the whole matrix just to look at one
# record wastes memory.
X_train_ct[0].toarray()

In [None]:
print(X_train_ct[0])

In [None]:
#Start building a Keras Sequential Model
tf.keras.backend.clear_session()
model = tf.keras.Sequential()

# Width of the CountVectorizer matrix the model is trained on. Derived from the
# data so the network does not break when the vocabulary size changes.
n_features = X_train_ct.shape[1]

#Add hidden layers
# Conv1D expects (steps, channels): treat every feature as one step with one channel.
model.add(tf.keras.layers.Reshape((n_features, 1), input_shape=(n_features,)))
model.add(tf.keras.layers.Conv1D(100, kernel_size=(3),activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Conv1D(50, kernel_size=(3), activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))

# Collapse the (steps, channels) output into one vector per message. Without
# this the output layer would emit one prediction per step, which does not
# match the single label per SMS in y_train.
model.add(tf.keras.layers.Flatten())

#Add Output layer
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

#Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Keras needs dense input. toarray() yields a plain numpy ndarray, whereas
# todense() returns the legacy np.matrix type.
model.fit(X_train_ct.toarray(), y_train,
           validation_data=(X_test_ct.toarray(), y_test),
           epochs=10,
           batch_size=32)

In [None]:
model.summary()

### 8. Controlling Vocabulary size

Thus far, we have been using the default parameters of [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html):

In [None]:
# Peek at the vocabulary learned with the default parameters; only the first
# 20 entries are shown because displaying the whole dict floods the output.
dict(list(cvect.vocabulary_.items())[:20])

In [None]:
# show default parameters for CountVectorizer (TFIDF will have similar parameters)
?cvect

However, the vectorizer is worth tuning, just like a model is worth tuning! Here are a few parameters that you might want to tune:

- **stop_words:** string {'english'}, list, or None (default)
    - If 'english', a built-in stop word list for English is used.
    - If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens.
    - If None, no stop words will be used.

In [None]:
# remove English stop words
vect = CountVectorizer(stop_words='english')
vect.fit(sms_train)
len(vect.get_feature_names_out())

- **ngram_range:** tuple (min_n, max_n), default=(1, 1)
    - The lower and upper boundary of the range of n-values for different n-grams to be extracted.
    - All values of n such that min_n <= n <= max_n will be used.

In [None]:
# include 1-grams, 2-grams and 3-grams
vect = CountVectorizer(ngram_range=(1, 3))
vect.fit(sms_train)
# get_feature_names() was removed from scikit-learn; use get_feature_names_out()
# like the other cells in this section.
len(vect.get_feature_names_out())

- **max_df:** float in range [0.0, 1.0] or int, default=1.0
    - When building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).
    - If float, the parameter represents a proportion of documents.
    - If integer, the parameter represents an absolute count.

In [None]:
# ignore terms that appear in more than 50% of the documents
vect = CountVectorizer(max_df=0.5)
vect.fit(sms_train)
len(vect.get_feature_names_out())

- **min_df:** int, default=1


> Defines the minimum number of documents a word must appear in before it is included in the vocabulary


In [None]:
# only keep terms that appear in at least 2 documents
vect = CountVectorizer(min_df=2)
vect.fit(sms_train)
len(vect.get_feature_names_out())

In [None]:
# Peek at the vocabulary kept with min_df=2; only the first 20 entries are
# shown because displaying the whole dict floods the output.
dict(list(vect.vocabulary_.items())[:20])

- **max_features**: int or None, default=None


> Maximum size of vocabulary. None means no hard limit.




In [None]:
# only keep terms that appear in at least 2 documents, with the vocabulary capped at 2000 terms
vect = CountVectorizer(min_df=2, max_features=2000)
vect.fit(sms_train)
len(vect.get_feature_names_out())

**Guidelines for tuning Vectorizer:**

- Use your knowledge of the **problem** and the **text**
- **Experiment**, and let the data tell you the best approach!
- Quite often, the number of features is limited by the amount of RAM/compute available.

### Word Cloud

In [None]:
import matplotlib.pyplot as plt # visualization
from wordcloud import WordCloud

In [None]:
# Build a word cloud from the text of every SMS message.
wc = WordCloud()
# Join the actual messages. str(sms_df['message']) is only the truncated
# printed repr of the Series (a handful of rows plus index numbers and dtype
# text), so the cloud would not reflect the dataset.
wc.generate(' '.join(sms_df['message'].astype(str)))
# declare our figure
plt.figure(figsize=(20,10), facecolor='k')
# add title to the graph
plt.title("Most frequent words in SMS dataset", fontsize=40, color='white')
plt.imshow(wc)
plt.show()