In [1]:
import unicodecsv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfTransformer

import pandas as pd
import re
from nltk import pos_tag


from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from matplotlib import pyplot as plt
import numpy as np

import csv                             
import nltk
nltk.data.path.append("/Users/Shared/nltk_data")

In [2]:
def loadData(path, Text=None):
    with open(path, 'r',encoding='utf-8') as f:
        reader = csv.reader(f,delimiter=',')
        for line in reader:
            (Lines, Character, Gender) = parseReview(line)
            rawData.append((Lines,Character, Gender))

def splitData(percentage):     # A method to split the data between trainData and testData 
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (Lines, _, Gender) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Lines)),Gender))
    for (Lines, _, Gender) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Lines)),Gender))

In [3]:
def parseReview(reviewLine):
    
    Lines = reviewLine[0]
    Character = reviewLine[1]
    Gender = reviewLine[2]
    
    return reviewLine[0], reviewLine[1], reviewLine[2] 

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
    
# Text processing with Scikit-Learn, basics
# Creating a vectorizer that can be used to extract a bag of words
# representation from documents


stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()


stemmer = SnowballStemmer("english")
stemmer = PorterStemmer()
# Input: a string of one review
def preProcess(text):


    pos_tags = [x[1] for x in pos_tag(text)] 
    text = word_tokenize(text)
    b = []
    for word in text:
        if word.isalpha(): # removing punctuation
            if word not in stop_words: # removing stopwords or "too common" words
                word = word.lower() # converting all letters to lower case 
                word = wordnet_lemmatizer.lemmatize(word)  # lemmatisation
                word = stemmer.stem(word) # Using standard stemmer from the nltk
                b.append(word)
     
    return b,pos_tags #returns both pos-tags and a list of words that have been pre-processed

In [5]:
featureDict = {} # A global dictionary of features
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)

def toFeatureVector(put):
    tokens,pos_tags = put
# Should return a dictionary containing features as keys, and weights as values
    featureVector = {}
    for token in tokens: #split words into tokens and pos
        if token not in featureVector:
            featureVector[token] = 1.0
        else:
            featureVector[token] = float(featureVector[token] + 1)
            
        if token not in featureDict:
            featureDict[token] = 1.0
        else:
            featureDict[token] = float(featureDict[token] + 1)

        
    for pos in pos_tags: #split words into tokens and pos
        if pos not in featureVector:
            featureVector[pos] = 1.0
        else:
            featureVector[pos] = float(featureVector[pos] + 1)
            
        if pos not in featureDict:
            featureDict[pos] = 1.0
        else:
            featureDict[pos] = float(featureDict[pos] + 1)    
            
    
    for i in range(1, len(tokens)):
            bigram = tokens[i-1] + " " + tokens[i]
            try:
                featureVector[bigram] = 1 #+= 1.0/len(tokens)
            except KeyError:
                featureVector[bigram] = 1 #= 1.0/len(tokens)
            try:
                featureVector[bigram] += 1.0
            except KeyError:
                featureVector[bigram] = 1.0
                
    
    sentence = len(tokens)
    featureDict['length']= sentence
        
    return featureVector

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('tfidf', TfidfTransformer()),('chi2', SelectKBest(chi2, k=15000)),('svc', LinearSVC(loss = 'hinge'))])
    return SklearnClassifier(pipeline).train(trainData)

In [7]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

def predictLabel(text, classifier):
    return classifier.classify(toFeatureVector(preProcess(text)))


In [8]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# loading reviews
rawData = []         
trainData = []        
testData = []         


# references to the data files
Path = 'training.csv'

loadData(Path) 

splitData(0.8)


In [9]:
# Finally, check the accuracy of your classifier by training on all the tranin data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    classifier = trainClassifier(trainData)  # train the classifier
    testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
    testPred = predictLabels(testData, classifier)  # classify the test data to get predicted labels
    print("Done training!")
    print(classification_report(testTrue, testPred))


Training Classifier...
Done training!
              precision    recall  f1-score   support

      female       0.56      0.48      0.52      1017
        male       0.54      0.63      0.58      1006

    accuracy                           0.55      2023
   macro avg       0.55      0.55      0.55      2023
weighted avg       0.55      0.55      0.55      2023



In [10]:
# # references to the data files
# rawData = []         
# trainData = []        
# testData = []   


# Path = 'test.csv'
# loadData(Path) 


# splitData(0)

# functions_complete = True  # set to True once you're happy with your methods for cross val
# if functions_complete:
#     testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
#     testPred = predictLabels(testData, classifier)  # classify the test data to get predicted labels
#     print("Done training!")
#     print(classification_report(testTrue, testPred))


In [11]:
# def confusion_matrix_heatmap(testTrue, testPred):
#     """Function to plot a confusion matrix"""
#     labels = list(set(testPred))   # get the labels in the y_test
#     # print(labels)
#     cm = confusion_matrix(testTrue, testPred, labels)
#     fig = plt.figure(figsize=(10,10))
#     ax = fig.add_subplot(111)
#     cax = ax.matshow(cm)
#     plt.title('Confusion matrix of the classifier')
#     fig.colorbar(cax)
#     ax.set_xticks(np.arange(len(labels)))
#     ax.set_yticks(np.arange(len(labels)))
#     ax.set_xticklabels( labels, rotation=0)
#     ax.set_yticklabels( labels)

#     for i in range(len(cm)):
#         for j in range(len(cm)):
#             text = ax.text(j, i, cm[i, j],
#                            ha="center", va="center", color="w")

#     plt.xlabel('Predicted')
#     plt.ylabel('True')
#     #fig.tight_layout()
#     plt.show()

In [12]:
# confusion_matrix_heatmap(testTrue, testPred)