In [1]:
import string
from collections import defaultdict
import multiprocessing as mp
from nltk import word_tokenize
import re
import pickle
import nltk

liwcPath = 'liwc/liwc-english-mod.dic'

TRANSLATE_TABLE = dict((ord(char), None) for char in string.punctuation)


class LIWC():
    """Top-level class"""

    def __init__(self, dict_path):
        self.lexicon, self.category_names = self._read_dic(dict_path)
        self.trie = self._build_trie(self.lexicon)

    def process_text(self, text):
        """Run LIWC on string"""

        tokenized = word_tokenize(text.lower().translate(TRANSLATE_TABLE))
        counts = defaultdict(int)
        dict_count = len(tokenized)

        for token in tokenized:
            classifications = list(self._parse_token(token))

            if not classifications:
                dict_count -= 1
            else:
                for category in classifications:
                    counts[category] += 1

        category_scores = {category: (
            counts[category] / len(tokenized)) * 100 for category in counts.keys()}

        return category_scores

    def process_df_mp(self, df, col):
        """Multi-process version of process_df"""
        cpu_count = mp.cpu_count()
        p = mp.Pool(cpu_count)

        batches = np.array_split(df, cpu_count)

        pool_results = p.starmap(self.process_df,[(batch, col) for batch in batches if len(batch) > 0])
        p.close()
        
        return pd.concat(pool_results, axis=0)

    def process_df(self, df, col):
        """Run LIWC on a dataframe column"""
        df[col] = df[col].astype(str)

        def apply_df(row, col):
            score = self.process_text(row[col])
            scores = {}
            
            for category in score:
                scores[category] = score[category]

            return pd.Series(scores)


        res = df.apply(apply_df, args=(col,), axis=1)

        return res


    def _read_dic(self, filepath):
        category_mapping = {}
        category_names = []
        lexicon = {}
        mode = 0    # the mode is incremented by each '%' line in the file
        with open(filepath) as dict_file:
            for line in dict_file:
                tsv = line.strip()
                if tsv:
                    parts = tsv.split('\t')
                    if parts[0] == '%':
                        mode += 1
                    elif mode == 1:
                        # definining categories
                        category_names.append(parts[1])
                        category_mapping[parts[0]] = parts[1]
                    elif mode == 2:
                        lexicon[parts[0]] = [category_mapping[category_id]
                                             for category_id in parts[1:]]
        return lexicon, category_names

    def _build_trie(self, lexicon):
        '''
        Build a character-trie from the plain pattern_string -> categories_list
        mapping provided by `lexicon`.

        Some LIWC patterns end with a `*` to indicate a wildcard match.
        '''
        trie = {}
        for pattern, category_names in lexicon.items():
            cursor = trie
            for char in pattern:
                if char == '*':
                    cursor['*'] = category_names
                    break
                if char not in cursor:
                    cursor[char] = {}
                cursor = cursor[char]
            cursor['$'] = category_names
        return trie

    def _search_trie(self, trie, token, token_i=0):
        '''
        Search the given character-trie for paths that match the `token` string.
        '''
        if '*' in trie:
            return trie['*']
        elif '$' in trie and token_i == len(token):
            return trie['$']
        elif token_i < len(token):
            char = token[token_i]
            if char in trie:
                return self._search_trie(trie[char], token, token_i + 1)
        return []

    def _parse_token(self, token):
        for category_name in self._search_trie(self.trie, token):
            yield category_name
            
            
''' 
Make LIWC feature extractor into class
'''


def makeLIWCDictionary(liwcPath, picklePath):
    '''
        Make lookup data structure from LIWC dictionary file
    '''
    LIWC_file = open(liwcPath, 'rb') # LIWC dictionary
    catNames = {}
    LIWC_file.readline() #skips first '%' line
    line = LIWC_file.readline()
    lookup = []
    while '%' not in line:
        keyval = line.split('\t')
        key = keyval[0]
        value = keyval[1].strip()
        catNames[key] = {'name' : value,
                         'words' : []}
        line = LIWC_file.readline()
    mapCategoriesToNumbers = catNames.keys()
    line = LIWC_file.readline() # skips second '%' line

    #return mapCategoriesToNumbers
    while line: #iterate through categories
        data = line.strip().split('\t')
        reString = '^'+data[0].replace('*', '.*') + '$'
        indeces = [mapCategoriesToNumbers.index(d) for d in data[1:]]
        lookupCell = (re.compile(reString), indeces)
        lookup.append(lookupCell)
        for cat in data[1:]:
            catNames[cat]['words'] += (data[0], reString)
        cats = data[1:]
        line = LIWC_file.readline()
    toPickle = {'categories' : catNames, 'lookup' : lookup, 'cat_to_num' : mapCategoriesToNumbers}
    pickle.dump(toPickle, open(picklePath, 'w'))
    return toPickle

class liwcExtractor():
    def __init__(self,
                tokenizer=None,
                ignore=None,
                dictionary=None,
                newCategories=None,
                keepNonDict=True,
                liwcPath=None):
        self.liwcPath = liwcPath
        self.dictionary = dictionary
        if tokenizer is None:
            self.tokenizer = self.nltk_tokenize
        if liwcPath is not None:
            self.dictionary = makeLIWCDictionary(liwcPath, './liwcDictionary.pickle')
            self.lookup = self.dictionary['lookup']
            self.categories = self.dictionary['categories']
            self.mapCategoriesToNumbers = self.dictionary['cat_to_num']
        elif self.dictionary==None:
            self.dictionary = makeLIWCDictionary(liwcPath, './liwcDictionary.pickle')
            self.lookup = self.dictionary['lookup']
            self.categories = self.dictionary['categories']
            self.mapCategoriesToNumbers = self.dictionary['cat_to_num']
        self.ignore = ignore
        self.newCategories = newCategories
        self.nonDictTokens = []
        self.keepNonDict = keepNonDict

    def getCategoryIndeces(self):
        indeces = [x['name'] for x in self.categories.values()]
        indeces += ['wc', 'sixltr','dic','punc','emoticon'] # These last two are not built yet.
        return indeces

    def extract(self, corpus):
        corpusFeatures = []
        for doc in corpus:
            features = self.extractFromDoc(doc)
            corpusFeatures.append(features)
        return corpusFeatures

    def extractFromDoc(self, document):
        tokens = self.tokenizer(document)
        #print tokens
        features = [0] * 70 # 66 = wc, total word count
                            # 67 = sixltr, six letter words
                            # 68 = dic, words found in LIWC dictionary
                            # 70 = punc, punctuation
                            # 71 = emoticon
        features[66] = len(tokens)

        for t in tokens: #iterating through tokens of a message
            #print "Token : " + t
            if len(t) > 6: # check if more than six letters
                features[67] += 1
            inDict = False
            for pattern, categories in self.lookup:
                if len(pattern.findall(t)) > 0:
                    inDict = True
                    for c in categories:
                        features[int(c)] += 1
            if inDict:
                features[68] += 1
            else:
                self.nonDictTokens.append(t)
        return features

    def patternsMatchedFromDoc(self, document):
        tokens = self.tokenizer(document)
        patterns = [l[0] for l in self.lookup]
        features = [0] * len(patterns)
        for t in tokens:
            for i, pattern in enumerate(patterns):
                if len(pattern.findall(t)) > 0:
                    features[i] += 1
        return features

    def nltk_tokenize(self, message):
        '''
            takes in a text string and returns a list of tokenized words using nltk methods
        '''
        # sentence tokenize
        stList = nltk.sent_tokenize(message)
        # word tokenize
        tokens = []
        for sent in stList:
            tokens += nltk.word_tokenize(sent)
        return tokens

In [4]:
import pandas as pd
import glob
import csv

path = r'C://Users/shiva/Desktop/RaviKrishna/txt/' # use your path
all_files = glob.glob(path + "*.txt")

li = []

for filename in all_files:
    disorder_type = str(filename[40:-4])
    df = pd.read_csv(filename, sep="\n", header=None,error_bad_lines=False,encoding='utf8', quoting=csv.QUOTE_NONE)
    df['Disorder Type'] = disorder_type 
    li.append(df)
tweetsDF = pd.concat(li, axis=0, ignore_index=True)

In [5]:
tweetsDF.columns = ['Tweets', 'Disorder Type']
print("Shape of the DataFrame", tweetsDF.shape)
tweetsDF.head()

Shape of the DataFrame (1916, 2)


Unnamed: 0,Tweets,Disorder Type
0,I have an assessment for therapy later. I went...,AD
1,Can GENERALIZED ANXIETY DISORDER lead to suici...,AD
2,en dit both ways. mensen met mental illness ze...,AD
3,"@BluemoonShell Yup. In the United States, we d...",AD
4,@iamWalkerR Heard thats a symtom from TAD: Tru...,AD


### Define Helper Functions

In [6]:
def clean_tweet(tweet):
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())


def analize_sentiment(tweet):
    # Simple implementation of the sgn(x) function to make the analysis more comprenesive. 
    
    analysis = TextBlob(clean_tweet(tweet))
    if analysis.sentiment.polarity > 0:
        return 1
    elif analysis.sentiment.polarity == 0:
        return 0
    else:
        return -1

In [8]:
from textblob import TextBlob
import numpy as np
import re

tweetsDF['Sentiment'] = np.array([ analize_sentiment(tweet) for tweet in tweetsDF['Tweets'] ])

pos_tweets = [ tweet for index, tweet in enumerate(tweetsDF['Tweets']) if tweetsDF['Sentiment'][index] > 0]
neu_tweets = [ tweet for index, tweet in enumerate(tweetsDF['Tweets']) if tweetsDF['Sentiment'][index] == 0]
neg_tweets = [ tweet for index, tweet in enumerate(tweetsDF['Tweets']) if tweetsDF['Sentiment'][index] < 0]

print("Percentage of positive tweets: {}%".format(len(pos_tweets)*100/len(tweetsDF['Tweets'])))
print("Percentage of neutral tweets: {}%".format(len(neu_tweets)*100/len(tweetsDF['Tweets'])))
print("Percentage of negative tweets: {}%".format(len(neg_tweets)*100/len(tweetsDF['Tweets'])))

display(tweetsDF.head(20))

Percentage of positive tweets: 29.07098121085595%
Percentage of neutral tweets: 49.26931106471816%
Percentage of negative tweets: 21.659707724425886%


Unnamed: 0,Tweets,Disorder Type,Sentiment
0,I have an assessment for therapy later. I went...,AD,0
1,Can GENERALIZED ANXIETY DISORDER lead to suici...,AD,0
2,en dit both ways. mensen met mental illness ze...,AD,-1
3,"@BluemoonShell Yup. In the United States, we d...",AD,0
4,@iamWalkerR Heard thats a symtom from TAD: Tru...,AD,1
5,"@bongwatersoda social anxiety, AvPD, BPD, bipo...",AD,1
6,Reinecke: interested in understanding which su...,AD,1
7,"More than 10,500 police officers across the UK...",AD,1
8,Mental Health Monday - Anxiety Disorder sympto...,AD,-1
9,Maternal mental health problems have been asso...,AD,-1


In [9]:
### Define more helper functions
def preprocess_tweet(tweet):
    #Preprocess the text in a single tweet
    #arguments: tweet = a single tweet in form of string 
    #convert the tweet to lower case
    tweet.lower()
    #convert all urls to sting "URL"
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #convert all @username to ""
    tweet = re.sub('@[^\s]+','', tweet)
    #correct all multiple white spaces to a single white space
    tweet = re.sub('[\s]+', ' ', tweet)
    #convert "#topic" to just "topic"
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    return tweet


# Clean a tweet
def clean_tweet(text):
    # Removal of URLs
    text = re.sub(r"http\S+", "", text)
    # Removal of mentions
    text = re.sub("@[^\s]*", "", text)
    # Removal of hashtags
    text = re.sub("#[^\s]*", "", text)
    # Removal of numbers
    text = re.sub('[0-9]*[+-:]*[0-9]+', '', text)
    # Convert to lowercase
    text = text.lower()
    # Apostrophe lookup
    text = re.sub("'ll", " will", text)
    text = re.sub("'ve", " have", text)
    text = re.sub("n't", " not", text)
    text = re.sub("'d", " would", text)
    text = re.sub("'re", " are", text)
    text = re.sub("i'm", "i am", text)
    text = re.sub("it's", "it is", text)
    text = re.sub("she's", "she is", text)
    text = re.sub("he's", "he is", text)
    text = re.sub("here's", "here is", text)
    text = re.sub("that's", "that is", text)
    text = re.sub("there's", "there is", text)
    text = re.sub("what's", "what is", text)
    text = re.sub("who's", "who is", text)
    text = re.sub("'s", "", text)
    # Handling slang words
    text = re.sub(r"\btmrw\b", "tomorrow", text)
    text = re.sub(r"\bur\b", "your", text)
    text = re.sub(r"\burs\b", "yours", text)
    text = re.sub(r"\bppl\b", "people", text)
    text = re.sub(r"\byrs\b", "years", text)
    # Handling acronyms
    text = re.sub(r"\b(rt)\b", "retweet", text)
    text = re.sub(r"\b(btw)\b", "by the way", text)
    text = re.sub(r"\b(asap)\b", "as soon as possible", text)
    text = re.sub(r"\b(fyi)\b", "for your information", text)
    text = re.sub(r"\b(tbt)\b", "throwback thursday", text)
    text = re.sub(r"\b(tba)\b", "to be announced", text)
    text = re.sub(r"\b(tbh)\b", "to be honest", text)
    text = re.sub(r"\b(faq)\b", "frequently asked questions", text)
    text = re.sub(r"\b(icymi)\b", "in case you missed it", text)
    text = re.sub(r"\b(aka)\b", "also known as", text)
    text = re.sub(r"\b(ama)\b", "ask me anything", text)
    # Word lemmatization
    lemmatizer = WordNetLemmatizer()
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
#df['SentimentText'] = df['SentimentText'].apply(lambda text: clean_tweet(text))

def feature_extraction(data, method = "tfidf"):
    #arguments: data = all the tweets in the form of array, method = type of feature extracter
    #methods of feature extractions: "tfidf" and "doc2vec"
    if method == "tfidf":
        from sklearn.feature_extraction.text import TfidfVectorizer
        tfv=TfidfVectorizer(sublinear_tf=True, stop_words = "english") # we need to give proper stopwords list for better performance
        features=tfv.fit_transform(data)
    elif method == "doc2vec":
        None
    else:
        return "Incorrect inputs"
    return features


In [10]:
from bs4 import BeautifulSoup
from html.parser import HTMLParser
from nltk import WordNetLemmatizer
tweetsDF['Cleaned Tweets'] = tweetsDF['Tweets'].apply(preprocess_tweet)
tweetsDF['Cleaned Tweets'] = tweetsDF['Tweets'].apply(clean_tweet)

In [11]:
tweetsDF.head(10)

Unnamed: 0,Tweets,Disorder Type,Sentiment,Cleaned Tweets
0,I have an assessment for therapy later. I went...,AD,0,i have an assessment for therapy later. i went...
1,Can GENERALIZED ANXIETY DISORDER lead to suici...,AD,0,can generalized anxiety disorder lead to suicide?
2,en dit both ways. mensen met mental illness ze...,AD,-1,en dit both ways. mensen met mental illness ze...
3,"@BluemoonShell Yup. In the United States, we d...",AD,0,"yup. in the united states, we don’t have pda, ..."
4,@iamWalkerR Heard thats a symtom from TAD: Tru...,AD,1,heard thats a symtom from tad: trump anxiety d...
5,"@bongwatersoda social anxiety, AvPD, BPD, bipo...",AD,1,"social anxiety, avpd, bpd, bipolar disorder, a..."
6,Reinecke: interested in understanding which su...,AD,1,reinecke: interested in understanding which su...
7,"More than 10,500 police officers across the UK...",AD,1,more than police officer across the uk took ti...
8,Mental Health Monday - Anxiety Disorder sympto...,AD,-1,mental health monday - anxiety disorder sympto...
9,Maternal mental health problems have been asso...,AD,-1,maternal mental health problem have been assoc...


In [12]:
liwc = LIWC("LIWC2015_English_Flat.dic")

In [15]:
liwc_df = liwc.process_df(tweetsDF, col='Cleaned Tweets')

In [16]:
# Save the dataframe
liwc_df.to_csv('LIWC_features.csv', index = False)

### Adding Extra Features

In [17]:
import re
def adding_extra_feature(df, tweet_column):
    
    # Print Number of Exclamation
    #length_of_excl = (len(re.findall(r'!', string)))
    df['number_of_exclamation'] = tweet_column.apply(lambda x: (len(re.findall(r'!', x))))
    
    # Number of ?
    #length_of_questionmark = (len(re.findall(r'?', string)))
    df['number_of_questionmark'] = tweet_column.apply(lambda x: (len(re.findall(r'[?]', x))))
    
    # Number of #
    df['number_of_hashtag'] = tweet_column.apply(lambda x: (len(re.findall(r'#', x))))
    
    # Number of @
    df['number_of_mention'] = tweet_column.apply(lambda x: (len(re.findall(r'@', x))))
    
    # Number of Quotes
    df['number_of_quotes'] = tweet_column.apply(lambda x: (len(re.findall(r"'", x))))

    # Number if underscore
    df['number_of_underscore'] = tweet_column.apply(lambda x: (len(re.findall(r'_', x))))
    
    
    return df

In [18]:
tweetsDF = adding_extra_feature(tweetsDF, tweetsDF["Tweets"])

In [22]:
tweetsDF.describe()

Unnamed: 0,Sentiment,number_of_exclamation,number_of_questionmark,number_of_hashtag,number_of_mention,number_of_quotes,number_of_underscore
count,1916.0,1916.0,1916.0,1916.0,1916.0,1916.0,1916.0
mean,0.074113,0.053758,0.087683,0.163361,0.508873,0.15762,0.106994
std,0.708573,0.307829,0.356418,0.717505,1.155495,0.478465,0.440191
min,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,1.0,0.0,0.0
max,1.0,7.0,6.0,11.0,9.0,4.0,5.0


### Adding Emoticons

- Here, users emoticons in a tweet also matters, so we will find the emoticons in a users tweet.

In [24]:
## Emoticon Detector

class EmoticonDetector:
    emoticons = {}

    def __init__(self, emoticon_file="emoticons.txt"):
        from pathlib import Path
        content = Path(emoticon_file).read_text()
        positive = True
        for line in content.split("\n"):
            if "positive" in line.lower():
                positive = True
                continue
            elif "negative" in line.lower():
                positive = False
                continue

            self.emoticons[line] = positive

    def is_positive(self, emoticon):
        if emoticon in self.emoticons:
            return self.emoticons[emoticon]
        return False

    def is_emoticon(self, to_check):
        return to_check in self.emoticons
ed = EmoticonDetector()

processed_data = tweetsDF.copy()

def add_column(column_name, column_content):
    processed_data.loc[:, column_name] = pd.Series(column_content, index=processed_data.index)

def count_by_lambda(expression, word_array):
    return len(list(filter(expression, word_array)))

add_column("splitted_text", map(lambda txt: txt.split(" "), processed_data["Tweets"]))

positive_emo = list(
    map(lambda txt: count_by_lambda(lambda word: ed.is_emoticon(word) and ed.is_positive(word), txt),
        processed_data["splitted_text"]))
add_column("number_of_positive_emo", positive_emo)

negative_emo = list(map(
    lambda txt: count_by_lambda(lambda word: ed.is_emoticon(word) and not ed.is_positive(word), txt),
    processed_data["splitted_text"]))

add_column("number_of_negative_emo", negative_emo)

In [32]:
emoticons_df = processed_data[['number_of_positive_emo', 'number_of_negative_emo']] 

In [33]:
# Save the emoticons dataframe
emoticons_df.to_csv('emoticons_df.csv', index=False)