Practical 6B: Sentiment Classification

Libraries to import
```Python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('punkt')
```

**Supervised Classification**   
- the classifier is trained with **labelled training data**

*Create list of movie review documents*
```Python
# Load the movie review dataset
from nltk.corpus import movie_reviews

# Every corpus file id is split on "/" into its sentiment label and file name,
# so each review becomes one (filename, raw text, sentiment) record.
reviews = []
for fileid in movie_reviews.fileids():
    sentiment, filename = fileid.split('/')
    reviews.append((filename, movie_reviews.raw(fileid), sentiment))
df = pd.DataFrame(reviews, columns=['filename', 'text', 'sentiment'])
print(df.shape)
display(df.head())

# Plotting the Sentiment distribution
# Series.value_counts() is used instead of the top-level pd.value_counts(),
# which is deprecated in recent pandas releases.
plt.figure()
df['sentiment'].value_counts().plot.bar(title="Sentiment Distribution")
plt.xlabel("Sentiment")
plt.ylabel("No. of rows in df")
plt.show()
```

**Feature Extraction**
- use top-N words feature

*Fetching words from corpus*
```Python
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Tokenise every review and flatten the result into one lower-cased word list.
# (The original comprehension was missing its closing bracket.)
all_words = [word.lower() for sent in df.text for word in word_tokenize(sent)]
# print first 10 words
print(all_words[:10])
```

*Create frequency distribution of words*: calculate the number of occurrences of each word in the entire list of words
```Python
from nltk import FreqDist
# Count how many times each token occurs in the full corpus word list
all_words_frequency = FreqDist(all_words)
# Printing the FreqDist itself shows a summary (number of samples and outcomes)
print (all_words_frequency)
# print 10 most frequently occurring words
print (all_words_frequency.most_common(10))
```

*Remove punctuation and stopwords (NLTK stopword list), then stem with GenSim*
```Python
import string
from nltk.corpus import stopwords
from gensim.parsing.porter import PorterStemmer

porter_stemmer = PorterStemmer()
stopwords_english = stopwords.words('english')


def clean(doc):
    """Return the stemmed content words of a tokenised document.

    Stop words and purely numeric tokens are dropped, punctuation characters
    are stripped from the remaining tokens, and what is left is stemmed with
    the module-level Porter stemmer.

    Args:
        doc: iterable of word tokens (callers in this file pass lower-cased
            tokens, matching the lower-case NLTK stop-word list).

    Returns:
        list of stemmed tokens, in their original order.
    """
    # A set gives O(1) membership tests; the stop-word list would otherwise be
    # scanned once per token.
    stopword_set = set(stopwords_english)
    strip_punctuation = str.maketrans('', '', string.punctuation)

    all_words_clean = []
    for word in doc:
        if word in stopword_set or word.isdigit():
            continue
        punc_free = word.translate(strip_punctuation)
        # Check for digits again on the stripped form: a token such as "1,000"
        # only becomes purely numeric once its punctuation is removed. (The
        # original re-tested `word`, which could never be numeric here.)
        if len(punc_free) > 2 and not punc_free.isdigit():
            all_words_clean.append(porter_stemmer.stem(punc_free))
    return all_words_clean


all_words_clean = clean(all_words)
# print the first 10 words
print(all_words_clean[:10])
```

**Finding frequency distribution of new list after removing stopwords**
```Python
# Recompute the distribution on the cleaned, stemmed tokens; this rebinds
# all_words_frequency, replacing the distribution of the raw tokens
all_words_frequency = FreqDist(all_words_clean)
print (all_words_frequency)

# print 10 most frequently occurring words
print (all_words_frequency.most_common(10))
```
Previously, before removing stopwords and punctuation, the frequency distribution was:

> **FreqDist with 46462 samples and 1525039 outcomes**

Now, the frequency distribution is:

> **FreqDist with 30899 samples and 686219 outcomes**

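This shows that after removing stop words, numbers and punctuation (and merging word forms through stemming), the vocabulary has shrunk by around 15,500 distinct tokens, and the total number of outcomes/words has dropped to a little under half of the original size.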

The list of **most common words** (the most frequently occurring words) now also contains meaningful words. Before cleaning, the 10 most frequently occurring entries were only stop words and punctuation.

### Create Word Feature using 2000 most frequently occurring words

We take 2000 most frequently occurring words as our feature.
```Python
print(len(all_words_frequency))

# keep the 2000 most frequently occurring words
most_common_words = all_words_frequency.most_common(2000)

# peek at the top of the ranking: the 10 most frequent entries
print(most_common_words[:10])

# ... and at the bottom: the entries from position 1990 onwards
print(most_common_words[1990:])

# every entry is a tuple whose first element is the word itself;
# keep only that first element as the feature vocabulary
word_features = [word for word, _count in most_common_words]
print(word_features[:10])

```

**Create Feature Set**
- apply text preprocessing through loops for the reviews
```Python
# Lower-case each review, tokenise it, then run the tokens through clean()
# so that df['text'] holds a list of stemmed tokens per review.
df['text'] = df['text'].str.lower().apply(word_tokenize).apply(clean)
df.head()
```
- create feature set to train classifier: for each word in `word_features`, record whether it occurs in the given document or not
```Python
def document_features(df, stemmed_tokens, vocabulary=None):
    """Build one term-presence feature dict per row of ``df``.

    Args:
        df: DataFrame holding one tokenised document per row.
        stemmed_tokens: name of the column that contains each row's tokens.
        vocabulary: words to test for. Defaults to the module-level
            ``word_features`` list when omitted.

    Returns:
        list of dicts, one per row in row order, mapping every vocabulary word
        to True if it occurs in that row's tokens and False otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features

    doc_features = []
    for tokens in df[stemmed_tokens]:
        # Build the set once per document so each vocabulary lookup is O(1)
        # instead of rescanning the token list for every feature word.
        present = set(tokens)
        doc_features.append({word: word in present for word in vocabulary})
    return doc_features

# One row of term-presence flags per review, aligned with df's index
feature_set = pd.DataFrame(
    data=document_features(df, 'text'),
    index=df.index,
)
feature_set.head()
```

**Training Classifier**
- create separate train and test sets. Hold out 400 elements of the feature set (20%) as the test set and use the rest for training, chosen with a stratified random split; generally an 80/20 split is used
```Python
import seaborn as sns
from sklearn.model_selection import train_test_split

# Features are the term-presence flags; the target is the last column of df,
# kept as a one-column DataFrame.
X = feature_set
y = df[df.columns[-1:]]

# 80/20 split, stratified on the target and seeded so it is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# share of each sentiment class in the training split
print(y_train['sentiment'].value_counts(normalize=True))

# plot chart
plt.style.use('ggplot')
plt.figure(figsize=(6, 4))
sns.countplot(data=y_train, x='sentiment')
```
*Use Decision Tree Classifier to train*
```Python
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Fit a decision tree on the training split (seeded for reproducibility)
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Predict once and reuse the result for every metric below, instead of
# running the classifier over the test set a second time.
y_pred = classifier.predict(X_test)

# classification report
print(classification_report(y_test, y_pred))

# accuracy score
print("Accuracy Score: " + str(accuracy_score(y_test, y_pred)))
```
Print Confusion Matrix
```Python
# Function to create a confusion matrix
def conf_matrix(y_test, pred_test):
    """Plot the confusion matrix of true vs. predicted labels as a heatmap.

    Rows correspond to the true labels and columns to the predicted labels.
    The class labels are taken from the data, so the plot is not limited to
    two classes and its axes show the class names rather than 0/1.

    Args:
        y_test: true labels.
        pred_test: predicted labels, in the same order as ``y_test``.
    """
    # Fix the label order explicitly so rows/columns can be named after the
    # classes; sorted unique labels match confusion_matrix's default order.
    labels = np.unique(np.concatenate([np.ravel(y_test), np.ravel(pred_test)]))

    # Creating a confusion matrix
    con_mat = confusion_matrix(y_test, pred_test, labels=labels)
    con_mat = pd.DataFrame(con_mat, index=labels, columns=labels)

    # Plotting the confusion matrix
    plt.figure(figsize=(6,6))
    sns.set(font_scale=1.5)
    sns.heatmap(con_mat, annot=True, annot_kws={"size": 16}, fmt='g', cmap='Blues', cbar=False)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    
# Plot the confusion matrix of the test labels against the predictions in y_pred
conf_matrix(y_test, y_pred)

```

**Bag of Words using TF-IDF feature set**
- create dictionary of unique words and calculate term weights for text feature.
```Python
import gensim
from gensim import corpora

# Build the dictionary from the token lists in df['text']
mydict = corpora.Dictionary(df['text'])
# Number of entries in the dictionary; used below as the feature vector length
vocab_len = len(mydict)

def get_bow_features(df, stemmed_tokens):
    """Return one dense bag-of-words vector per row of ``df``.

    Args:
        df: DataFrame holding one tokenised document per row.
        stemmed_tokens: name of the column that contains each row's tokens.

    Returns:
        list with, for every row in order, a 1-D array of length ``vocab_len``
        built from that row's ``mydict.doc2bow`` representation.
    """
    def to_dense(tokens):
        # Wrap the single document in a one-element corpus, convert it to a
        # sparse matrix with vocab_len rows, and keep its only column.
        bow = mydict.doc2bow(tokens)
        sparse_column = gensim.matutils.corpus2csc([bow], num_terms=vocab_len)
        return sparse_column.toarray()[:, 0]

    return [to_dense(tokens) for tokens in df[stemmed_tokens]]

# Column names: the dictionary token for every id from 0 to vocab_len - 1
header = ",".join(str(mydict[ele]) for ele in range(vocab_len))

bow_features = pd.DataFrame(
    data=get_bow_features(df, 'text'),
    index=df.index,
    columns=header.split(','),
)
bow_features.head()

# Create term weights with TF-IDF
# NOTE(review): the two gensim imports and the dictionary build below repeat
# statements that already appear earlier in this same code block.
import gensim
from gensim import corpora
from gensim.models import TfidfModel

# Build the dictionary
mydict = corpora.Dictionary(df['text'])
vocab_len = len(mydict)
# Bag-of-words representation of every document, used to fit the TF-IDF model
corpus = [mydict.doc2bow(line) for line in df['text']]
tfidf_model = TfidfModel(corpus)

def get_tfidf_features(df, stemmed_tokens):
    """Return one dense TF-IDF vector per row of ``df``.

    Args:
        df: DataFrame holding one tokenised document per row.
        stemmed_tokens: name of the column that contains each row's tokens.

    Returns:
        list with, for every row in order, a 1-D array of length ``vocab_len``
        holding the ``tfidf_model`` output for that row's bag of words.
    """
    def weighted_vector(tokens):
        # Bag of words -> TF-IDF transform -> one-column sparse matrix with
        # vocab_len rows -> dense 1-D array.
        bow = mydict.doc2bow(tokens)
        weighted = tfidf_model[bow]
        sparse_column = gensim.matutils.corpus2csc([weighted], num_terms=vocab_len)
        return sparse_column.toarray()[:, 0]

    return [weighted_vector(tokens) for tokens in df[stemmed_tokens]]

# Column names: the dictionary token for every id from 0 to vocab_len - 1
header = ",".join(str(mydict[ele]) for ele in range(vocab_len))

tfidf_features = pd.DataFrame(
    data=get_tfidf_features(df, 'text'),
    index=df.index,
    columns=header.split(','),
)
tfidf_features.head()
```

**Training Classifier + Accuracy calculation**
```Python
# Same stratified 80/20 split as before, now on the TF-IDF features
X = tfidf_features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# NOTE(review): GaussianNB is imported but not used; this cell trains a decision tree.
from sklearn.naive_bayes import GaussianNB
# using decision tree
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(X_train, y_train)
# Predict once and reuse the result for both metrics, instead of running the
# classifier over the test set twice.
y_pred = classifier.predict(X_test)
# classification report
print(classification_report(y_test, y_pred))
# accuracy score
print("Accuracy Score: " + str(accuracy_score(y_test, y_pred)))
```