# Test Your Model

In [1]:
import pickle
import pandas as pd
import re, string, nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def test_trained_model(model_path, test_text):
    saved_model_dic = pickle.load(open(model_path,"rb"))
    saved_clf = saved_model_dic['model']
    saved_vectorizer = saved_model_dic['vectorizer']
    print(len(saved_vectorizer.vocabulary))
    new_test_vecs = saved_vectorizer.fit_transform(test_text)
    return saved_clf.predict(new_test_vecs)

In [2]:
def text_preprocessor(data):
    """Clean raw text lines for vectorization.

    Stages, applied to the whole batch in this order:
      1. delete HTML-like tags and character entities (replaced by the
         empty string, so no separator is inserted where they stood);
      2. strip surrounding whitespace and lower-case;
      3. turn hyphens into spaces and replace every token that is a key of
         the module-level ``negate_handle`` mapping with its mapped value;
      4. delete all ``string.punctuation`` characters;
      5. drop NLTK English stopwords.

    Args:
        data: Iterable of strings.

    Returns:
        List of cleaned strings, one per input line.
    """
    # The pattern runs before lower-casing, so only lower-case entity names
    # and hex digits are matched.
    markup = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    without_markup = [markup.sub('', raw) for raw in data]

    normalized = [text.strip().lower() for text in without_markup]

    # Token substitution via negate_handle
    # (presumably contraction/negation expansions — confirm against training code).
    substituted = []
    for text in normalized:
        tokens = text.replace("-", " ").split()
        swapped = []
        for token in tokens:
            if token in negate_handle:
                swapped.append(negate_handle[token])
            else:
                swapped.append(token)
        substituted.append(" ".join(swapped))

    # One translation table is enough for every line.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    unpunctuated = [text.translate(drop_punctuation) for text in substituted]

    english_stops = set(stopwords.words("english"))
    cleaned = []
    for text in unpunctuated:
        kept = [token for token in text.split() if token not in english_stops]
        cleaned.append(" ".join(kept))

    return cleaned

def stem_lemmatize(data):
    """Lemmatize every whitespace-separated token of each line.

    Despite the name, no stemming is performed: each token is passed on its
    own to ``WordNetLemmatizer.lemmatize`` and the results are re-joined
    with single spaces.

    Args:
        data: Iterable of strings.

    Returns:
        List of lemmatized strings, one per input line.
    """
    wordnet = WordNetLemmatizer()

    def _lemmatize_line(text):
        # Splitting on whitespace and re-joining also collapses repeated spaces.
        return " ".join(map(wordnet.lemmatize, text.split()))

    return [_lemmatize_line(text) for text in data]

In [3]:
# load sample test data
test_data = pd.read_csv('coursework1_train.csv')

# Drop the unnamed column if present (presumably a saved index — confirm).
if 'Unnamed: 0' in test_data.columns:
    del test_data['Unnamed: 0']

# Encode labels: 'pos' -> 1, anything else -> 0.
test_data['sentiment'] = test_data.sentiment.map(lambda label: 1 if label == 'pos' else 0)

# preprocessing unseen data
# NOTE: pickle.load can execute arbitrary code; only use a trusted model file.
with open("sample_trained_model.pickle", "rb") as f:
    saved_file_comp = pickle.load(f)

saved_clf = saved_file_comp['model']
# text_preprocessor reads `negate_handle` as a global, so it has to be bound
# before the preprocessing call below.
negate_handle = saved_file_comp["negate_handle"]
feature_names = saved_file_comp["feature_names"]

raw_lines = test_data['text'].tolist()
processed_data = text_preprocessor(raw_lines)
processed_data = stem_lemmatize(processed_data)
test_data['text'] = processed_data

# Only the final 5000 rows are used for evaluation.
test_text = test_data['text'].iloc[-5000:].tolist()
test_labels = test_data['sentiment'].iloc[-5000:].tolist()

print('test data size', len(test_labels))

test data size 5000


In [4]:
# test model
# Predict labels for the prepared evaluation slice with the pickled model,
# then report accuracy, the confusion matrix and per-class metrics.
# (accuracy_score is imported with the other sklearn.metrics names in the
# notebook's import cell.)
new_test_pred = test_trained_model("sample_trained_model.pickle", test_text)
acc = accuracy_score(test_labels, new_test_pred)
print('accuracy', acc)

# confusion matrix (rows: true labels, columns: predicted labels)
print(confusion_matrix(test_labels, new_test_pred))

# classification_report
print(classification_report(test_labels, new_test_pred))

5000
accuracy 0.9058
[[2237  253]
 [ 218 2292]]
              precision    recall  f1-score   support

           0       0.91      0.90      0.90      2490
           1       0.90      0.91      0.91      2510

    accuracy                           0.91      5000
   macro avg       0.91      0.91      0.91      5000
weighted avg       0.91      0.91      0.91      5000

