In [1]:
import numpy as np
import pandas as pd

In [7]:
# Load the depression data from the csv file
df = pd.read_csv("depression_data.csv")
print(df.shape)
df.head()

(3200, 3)


Unnamed: 0.1,Unnamed: 0,tweet,target
0,0,Today in Selfcare: beauty &amp; laughs Kung Fu...,0
1,1,I get to spend New Year's home again alone and...,1
2,2,"Depressed and lonely /: Stuck in a deep, never...",1
3,3,If this is your response to someone saying the...,0
4,4,Apparently you get a free pass just by mention...,0


In [18]:
# Extract the input and output from the dataframe
x_train = df["tweet"]
y_train = df["target"]

x_train[0]

'Today in Selfcare: beauty &amp; laughs Kung Fu Panda 3 #Wellness #joy #laughter #selfcare #therapist #philadelphia'

In [17]:
from nltk.tokenize import word_tokenize
word_tokenize(x_train[0].lower())

['today',
 'in',
 'selfcare',
 ':',
 'beauty',
 '&',
 'amp',
 ';',
 'laughs',
 'kung',
 'fu',
 'panda',
 '3',
 '#',
 'wellness',
 '#',
 'joy',
 '#',
 'laughter',
 '#',
 'selfcare',
 '#',
 'therapist',
 '#',
 'philadelphia']

In [42]:
documents = []
for i in range(len(x_train)):
    documents.append((word_tokenize(x_train[i].lower()) ,y_train[i]))
documents[0]   

(['today',
  'in',
  'selfcare',
  ':',
  'beauty',
  '&',
  'amp',
  ';',
  'laughs',
  'kung',
  'fu',
  'panda',
  '3',
  '#',
  'wellness',
  '#',
  'joy',
  '#',
  'laughter',
  '#',
  'selfcare',
  '#',
  'therapist',
  '#',
  'philadelphia'],
 0)

In [43]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [44]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [45]:
from nltk.corpus import stopwords
import string
stops = stopwords.words('english') + list(string.punctuation)
stops

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [46]:
from nltk import pos_tag
w = "better"
pos_tag([w])

[('better', 'RBR')]

In [71]:
def clean_data(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])                                 
            clean_word = lemmatizer.lemmatize(w, get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [72]:
import time
start = time.time()
documents = [(clean_data(document), category) for document, category in documents]
end = time.time()
print("Cleaning time: ", end - start)

Cleaning time:  2.556807279586792


In [77]:
documents[0]

(['today',
  'selfcare',
  'beauty',
  'amp',
  'laugh',
  'kung',
  'fu',
  'panda',
  '3',
  'wellness',
  'joy',
  'laughter',
  'selfcare',
  'therapist',
  'philadelphia'],
 0)

In [78]:
print(len(documents))

3200


In [79]:
# Split into training and testing data
training_documents = documents[0:2300]
testing_documents = documents[2300:]

In [97]:
all_words = []
for doc in training_documents:
    all_words += doc[0]

In [98]:
import nltk
freq = nltk.FreqDist(all_words)                 #will retrurn a freq distribution object
print(len(freq))
common = freq.most_common(1000)
features = [i[0] for i in common]
features

5841


["n't",
 "'s",
 "'m",
 'feel',
 'day',
 'get',
 '...',
 'like',
 'go',
 '’',
 'year',
 'life',
 'happy',
 'love',
 'make',
 'one',
 'want',
 'u',
 'time',
 'today',
 'see',
 'know',
 'well',
 'need',
 'people',
 '..',
 'look',
 'much',
 'think',
 'ca',
 'good',
 'really',
 'smile',
 'new',
 'thing',
 "'re",
 'start',
 'never',
 'would',
 'take',
 'try',
 'back',
 'amp',
 'right',
 "'ve",
 'always',
 'watch',
 'great',
 'come',
 'even',
 'help',
 'bad',
 'work',
 'someone',
 '”',
 'na',
 'still',
 'ever',
 'talk',
 'say',
 'hope',
 'way',
 '2',
 'best',
 'amaze',
 'depression',
 'w',
 'wish',
 'friend',
 'let',
 'christmas',
 'depressed',
 '``',
 'keep',
 "'ll",
 'cry',
 'last',
 'fuck',
 '\xad',
 'morning',
 "''",
 'alone',
 'hate',
 'could',
 'person',
 'lose',
 'miss',
 'sad',
 'laugh',
 'give',
 '....',
 '.......',
 'bed',
 'birthday',
 'thought',
 'end',
 'heart',
 'happiness',
 'hard',
 'stop',
 'god',
 'night',
 'wait',
 'world',
 'week',
 'nothing',
 'pain',
 'may',
 'thank',
 '

In [99]:
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [100]:
training_data = [(get_feature_dict(doc), category) for doc, category in training_documents]
testing_data = [(get_feature_dict(doc), category) for doc, category in testing_documents]

In [101]:
type(training_data)

list

In [102]:
training_data[0]

({"n't": False,
  "'s": False,
  "'m": False,
  'feel': False,
  'day': False,
  'get': False,
  '...': False,
  'like': False,
  'go': False,
  '’': False,
  'year': False,
  'life': False,
  'happy': False,
  'love': False,
  'make': False,
  'one': False,
  'want': False,
  'u': False,
  'time': False,
  'today': True,
  'see': False,
  'know': False,
  'well': False,
  'need': False,
  'people': False,
  '..': False,
  'look': False,
  'much': False,
  'think': False,
  'ca': False,
  'good': False,
  'really': False,
  'smile': False,
  'new': False,
  'thing': False,
  "'re": False,
  'start': False,
  'never': False,
  'would': False,
  'take': False,
  'try': False,
  'back': False,
  'amp': True,
  'right': False,
  "'ve": False,
  'always': False,
  'watch': False,
  'great': False,
  'come': False,
  'even': False,
  'help': False,
  'bad': False,
  'work': False,
  'someone': False,
  '”': False,
  'na': False,
  'still': False,
  'ever': False,
  'talk': False,
  'say': Fa

In [103]:
#Classification using NLTK Naive Bayes

from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(training_data)

In [104]:
nltk.classify.accuracy(classifier, testing_data)

0.7744444444444445

In [105]:
classifier.show_most_informative_features(15)

Most Informative Features
                     amp = True                0 : 1      =     12.2 : 1.0
                  broken = True                1 : 0      =     11.9 : 1.0
                     hat = True                1 : 0      =     11.9 : 1.0
                     sad = True                1 : 0      =     10.4 : 1.0
                    mess = True                1 : 0      =     10.1 : 1.0
                  bother = True                1 : 0      =      8.2 : 1.0
                    felt = True                1 : 0      =      8.2 : 1.0
                  forgot = True                1 : 0      =      8.2 : 1.0
                mentally = True                1 : 0      =      8.2 : 1.0
                   space = True                1 : 0      =      8.2 : 1.0
                   laugh = True                0 : 1      =      7.9 : 1.0
               happiness = True                0 : 1      =      7.6 : 1.0
                    suck = True                1 : 0      =      7.2 : 1.0

In [112]:
# Now predict the new data
# Load data
file = open("test.txt", "r");
rawData = file.read()
file.close()
arrayData = rawData.split("\n")
arrayData = [ data.split(";")[0] for data in arrayData]

In [114]:
# Convert it into document which will be array of array of words using
inputDocuments = []
for i in range(len(arrayData)):
    inputDocuments.append(word_tokenize(arrayData[i].lower()))
inputDocuments[8] 

['i',
 'like',
 'to',
 'have',
 'the',
 'same',
 'breathless',
 'feeling',
 'as',
 'a',
 'reader',
 'eager',
 'to',
 'see',
 'what',
 'will',
 'happen',
 'next']

In [116]:
# Clean it
inputDocuments = [clean_data(document) for document in inputDocuments]
inputDocuments[8]

['like', 'breathless', 'feel', 'reader', 'eager', 'see', 'happen', 'next']

In [119]:
# Get feature dictionary format for Naive Bayes classification 
predection_data_x = [get_feature_dict(doc) for doc in inputDocuments]
predection_data_x[8]

{"n't": False,
 "'s": False,
 "'m": False,
 'feel': True,
 'day': False,
 'get': False,
 '...': False,
 'like': True,
 'go': False,
 '’': False,
 'year': False,
 'life': False,
 'happy': False,
 'love': False,
 'make': False,
 'one': False,
 'want': False,
 'u': False,
 'time': False,
 'today': False,
 'see': True,
 'know': False,
 'well': False,
 'need': False,
 'people': False,
 '..': False,
 'look': False,
 'much': False,
 'think': False,
 'ca': False,
 'good': False,
 'really': False,
 'smile': False,
 'new': False,
 'thing': False,
 "'re": False,
 'start': False,
 'never': False,
 'would': False,
 'take': False,
 'try': False,
 'back': False,
 'amp': False,
 'right': False,
 "'ve": False,
 'always': False,
 'watch': False,
 'great': False,
 'come': False,
 'even': False,
 'help': False,
 'bad': False,
 'work': False,
 'someone': False,
 '”': False,
 'na': False,
 'still': False,
 'ever': False,
 'talk': False,
 'say': False,
 'hope': False,
 'way': False,
 '2': False,
 'best': Fal

In [123]:
# Save the prediction using
predection_data_y=[]
for i in range(len(predection_data_x)):
    predection_data_y.append(classifier.classify(predection_data_x[i]))

In [125]:
predection_data_y[8]

0

In [126]:
# Save it to a csv file
dfOutput= pd.DataFrame(predection_data_y)
dfOutput.to_csv('test_predection.csv',header=False,index=False)