### Sentiment Analysis of Movie Reviews
https://stackabuse.com/text-classification-with-python-and-scikit-learn/

In [1]:
import numpy as np  
import re  
import nltk  
from sklearn.datasets import load_files  
nltk.download('stopwords')  
import pickle  
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thoma\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [2]:
movie_data = load_files(r"C:\txt_sentoken")
X, y = movie_data.data, movie_data.target  

In [8]:
#text preprocessing
documents = []

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

stemmer = WordNetLemmatizer()

for sen in range(0, len(X)):  
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))

    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()

    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)

    documents.append(document)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\thoma\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


## Converting text to numbers  
Two popular approaches are Bag of Words and the Word Embedding Model  
### Vectorizer parameters  
We set the max_features parameter to 1500, which means that we want to use 1500 most occurring words as features for training our classifier  
min_df is set to 5, which means the minimum number of documents that should contain this feature. So we only include those words that occur in at least 5 documents  
max_df is 0.7, which means that we should include only those words that occur in a maximum of 70% of all the documents. Words that occur in almost every document are usually not suitable for classification because they do not provide any unique information about the document  
Remove stopwords by passing the stop.words object from the nltk.corpus library   

In [9]:
#Bag of words
from sklearn.feature_extraction.text import CountVectorizer  
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
#The fit_transform function of the CountVectorizer class converts 
#text documents into corresponding numeric features 
X = vectorizer.fit_transform(documents).toarray()

## TFIDF (term frequency * inverse document frequency)
Term frequency = (Number of Occurrences of a word)/(Total words in the document)  
IDF(word) = Log((Total number of documents)/(Number of documents containing the word))  
The TFIDF value for a word in a particular document is higher if the frequency of occurrence of that word is higher in that specific document but lower in all the other documents  

In [None]:
#before converting to Bag of Words
from sklearn.feature_extraction.text import TfidfVectorizer  
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))  
X = tfidfconverter.fit_transform(documents).toarray()  

In [14]:
#after converting to Bag of Words
from sklearn.feature_extraction.text import TfidfTransformer  
tfidfconverter = TfidfTransformer()  
X = tfidfconverter.fit_transform(X).toarray()  

In [15]:
#split into train and test sets
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

In [17]:
#random forest
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)  
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [18]:
#use predict method of random forest classifier
y_pred = classifier.predict(X_test)

In [19]:
#evaluate model
#we can use metrics such as the confusion matrix, F1 measure, and the accuracy  
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

[[180  28]
 [ 30 162]]
              precision    recall  f1-score   support

           0       0.86      0.87      0.86       208
           1       0.85      0.84      0.85       192

   micro avg       0.85      0.85      0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400

0.855


From the output, it can be seen that our model achieved an accuracy of 85.5%, which is very good given the fact that we randomly chose all the parameters for CountVectorizer as well as for our random forest algorithm

### Saving and loading the model
Once the model is trained, we can save it as a pickle object in Python 

In [20]:
#save model
#creates a "text_classifier" file in the working directory  
with open('text_classifier', 'wb') as picklefile:  
    pickle.dump(classifier,picklefile)

In [None]:
#load model
with open('text_classifier', 'rb') as training_model:  
    model = pickle.load(training_model)

In [None]:
#test to see if loaded model works correctly
y_pred2 = model.predict(X_test)

print(confusion_matrix(y_test, y_pred2))  
print(classification_report(y_test, y_pred2))  
print(accuracy_score(y_test, y_pred2))