In [87]:
import glob
import sys
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.linear_model import LogisticRegression as LR
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import *
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

In [100]:
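# One-time setup: uncomment and run once per machine to fetch the NLTK
# resources this notebook relies on.
# nltk.download('punkt')                       # models used by word_tokenize
# nltk.download('wordnet')                     # corpus used by WordNetLemmatizer
# nltk.download('averaged_perceptron_tagger')  # model used by nltk.pos_tag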

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shikha/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [90]:
path = '/Users/shikha/Documents/projects/omdena/dataset/example/*'
files = glob.glob(path)
data = {}
# label 0 means no ptsd
data[0] = []
# label 1 means ptsd
data[1] = []
count = 0
separator = ""
for name in files: # 'file' is a builtin type, 'name' is a less-ambiguous variable name.
    try:
        with open(name) as f:
            content = f.readlines()
            content = separator.join(content)
            count += 1
            if count % 2 == 0:
                data[0].append(content)
            else:
                data[1].append(content)
    except IOError as exc:
        print "IO Error"

print "Total Clients processed: ", count

Total Clients processed:  3


In [91]:
print "Count of no ptsd clients: ", len(data[0])
print "Count of ptsd clients: ", len(data[1])

Count of no ptsd clients:  1
Count of ptsd clients:  2


In [None]:
def load_dataset(filename=""):
    X = []
    Y = []
    for label, transcripts in data.items():
        X += [val for val in transcripts]
        Y += [label] * len(transcripts)
    X = np.array(X)
    Y = np.array(Y)
    
    return X, Y

In [97]:
def tokenize(doc, decode=True):
#     sent_tokenizer = nltk.sent_tokenize()
#     word_tokenizer = nltk.word_tokenize()
    if decode:
        return word_tokenize(doc.decode('utf-8'))
    else:
        return word_tokenize(doc)

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = nltk.stem.WordNetLemmatizer()
        
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in tokenize(doc, decode=False)]

class WordFeature(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        res = []
        for sentence in posts:
            feats = {}
            words = tokenize(sentence)
            countExclamation = 0
            i = 0
            length = len(words)
            while i < length:
                w = words[i]
#                 countExclamation += word.count('!')
                i += 1
            for j, word in enumerate(words):
                countExclamation += word.count('!')

            feats['length'] = len(sentence)
            feats['exclamation'] = countExclamation
            res.append(feats)
        return res

class PosTags(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        res = []
        for sentence in posts:
            words = tokenize(sentence)
            tags = nltk.pos_tag(words)
            feats = {}
            countE = 0
            for w,t in tags:
                feats[w] = t
                # countE += w.count('!')
            # feats['length'] = len(sentence)
            # feats['exclamation'] = countE
            res.append(feats)
        # print "--------------------------------------------------"
        # print res[0]
        return res

In [98]:
def init():
    clf = Pipeline([

        ('union', FeatureUnion(
            transformer_list=[
                ('vectorizer', CountVectorizer(ngram_range=(1, 2), tokenizer=LemmaTokenizer())),
                # ('vectorizer', Pipeline([
                #     ('count', CountVectorizer(ngram_range=(1, 2))),
                #     ('tfidf', TfidfTransformer())
                # ])),
        
                ('body_pos', Pipeline([
                    ('pos', PosTags()),  # returns a list of dicts
                    ('vect', DictVectorizer())  # list of dicts -> feature matrix
                ])),
                ('body_stats', Pipeline([
                    ('word_stats', WordFeature()),
                    ('vect', DictVectorizer())  # list of dicts -> feature matrix
                ]))
            ],
            # weight components in FeatureUnion
            transformer_weights={
                'vectorizer': 1.0,
                'body_stats': 1.0,
            },
        )),

        ('classifier', LR())
    ])
    return clf

def train(clf, X_train, Y_train):
    clf.fit(X_train, Y_train)

def predict(clf, X_test, Y_test):
    y_pred = clf.predict(X_test)
    return y_pred

def train_cv(clf, X, Y):
    k_fold = KFold(n_splits=5, shuffle=True, random_state=3)
    split = k_fold.split(X)
    scores = []
    confusion = np.array([[0, 0], [0, 0]])
    for train_indices, test_indices in split:
        X_train = X[train_indices]
        Y_train = Y[train_indices]

        X_val = X[test_indices]
        Y_val = Y[test_indices]

        train(clf, X_train, Y_train)
        predictions = predict(clf, X_val, Y_val)

        confusion += confusion_matrix(Y_val, predictions)
        score = f1_score(Y_val, predictions)
        scores.append(score)
    return confusion, scores


In [102]:
X, Y = load_dataset()
# train_len = int(0.8 * X.shape[0])
# X_train = X[0:train_len]
# Y_train = Y[0:train_len]
# X_test = X[train_len:]
# Y_test = Y[train_len:]

clf = init()

# confusion, scores = train_cv(clf, X_train, Y_train)
# print "Total tweets classified:", X.shape[0]
# print "F1 Score:", sum(scores)/len(scores)
# print "Confusion matrix:"
# print confusion

# testing with example data; remove later
train(clf, X, Y)
y_pred = predict(clf, X, Y)
print y_pred


tokenize
tokenize
tokenize
tokenize
tokenize
tokenize
tokenize
tokenize
tokenize
tokenize
tokenize
tokenize
tokenize
tokenize
tokenize
tokenize
tokenize
tokenize
[0 1 1]
