In [48]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import sklearn
import regex as re
import emoji
import nltk
import matplotlib.pyplot as plt
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from tensorflow.keras.models import Sequential
from keras import layers
from tensorflow.keras.layers import Reshape
from tensorflow.keras.layers import Dropout
import keras
from keras.regularizers import l1, l2

In [49]:
df=pd.read_excel("Dataset.xlsx")
df['text'].dropna()
df

Unnamed: 0,text,label,Unnamed: 3
0,going through depression open this thread,1,
1,fivebalanceusa good afternoon would honored to...,0,
2,fkin depressed gonna feeling like this,1,
3,when have anxiety every pleasure guilty pleasure,0,
4,sorry canxexxt talk right ixexxm doing girl sh...,1,
...,...,...,...
7693,that frequently,0,
7694,counterargument krugman predicted great depres...,1,
7695,think dignity overrated that status doled limi...,1,
7696,cant make depression disappear over night sure...,0,


In [50]:
#removing emojis
df["text"]=df["text"].astype(str)
df["text"]=df["text"].apply(emoji.demojize)
df

Unnamed: 0,text,label,Unnamed: 3
0,going through depression open this thread,1,
1,fivebalanceusa good afternoon would honored to...,0,
2,fkin depressed gonna feeling like this,1,
3,when have anxiety every pleasure guilty pleasure,0,
4,sorry canxexxt talk right ixexxm doing girl sh...,1,
...,...,...,...
7693,that frequently,0,
7694,counterargument krugman predicted great depres...,1,
7695,think dignity overrated that status doled limi...,1,
7696,cant make depression disappear over night sure...,0,


In [51]:
#removing URls
df["text"] = df["text"].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
df

Unnamed: 0,text,label,Unnamed: 3
0,going through depression open this thread,1,
1,fivebalanceusa good afternoon would honored to...,0,
2,fkin depressed gonna feeling like this,1,
3,when have anxiety every pleasure guilty pleasure,0,
4,sorry canxexxt talk right ixexxm doing girl sh...,1,
...,...,...,...
7693,that frequently,0,
7694,counterargument krugman predicted great depres...,1,
7695,think dignity overrated that status doled limi...,1,
7696,cant make depression disappear over night sure...,0,


In [52]:
#removing non english words
words = set(nltk.corpus.words.words())
def removeNonEng(sent):
    k=" ".join(w for w in nltk.wordpunct_tokenize(sent) \
    if w.lower() in words or not w.isalpha())
    return k

df["text"]=df["text"].apply(removeNonEng)
df

Unnamed: 0,text,label,Unnamed: 3
0,going through depression open this thread,1,
1,good afternoon would took look project,0,
2,depressed feeling like this,1,
3,when have anxiety every pleasure guilty pleasure,0,
4,sorry talk right doing girl laundry depressed,1,
...,...,...,...
7693,that frequently,0,
7694,counterargument great depression march dead sq...,1,
7695,think dignity that status limited supply shame...,1,
7696,cant make depression disappear over night sure...,0,


In [53]:
#removing stopwords
stop_words= stopwords.words('english')
personal_pronouns= ['i', 'you', 'she', 'he', 'they'] #keeping personal pronouns to serve as a feature

for word in personal_pronouns:
    if word in stop_words:
        stop_words.remove(word)

df["text"].apply(lambda x: [item for item in x if item not in stop_words])
df

Unnamed: 0,text,label,Unnamed: 3
0,going through depression open this thread,1,
1,good afternoon would took look project,0,
2,depressed feeling like this,1,
3,when have anxiety every pleasure guilty pleasure,0,
4,sorry talk right doing girl laundry depressed,1,
...,...,...,...
7693,that frequently,0,
7694,counterargument great depression march dead sq...,1,
7695,think dignity that status limited supply shame...,1,
7696,cant make depression disappear over night sure...,0,


In [54]:
#vectorizing natural language
X=df["text"]
y=df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [55]:
#logistic regression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))

Accuracy: 0.7915584415584416
Precision: 0.7882632831086439
Recall: 0.9484732824427481
F1 score: 0.8609787786920744


In [56]:
#knn
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))

Accuracy: 0.42337662337662335
Precision: 0.8571428571428571
Recall: 0.183206106870229
F1 score: 0.30188679245283023


In [57]:
#decision tree
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))

Accuracy: 0.7668831168831168
Precision: 0.821062441752097
Recall: 0.8406488549618321
F1 score: 0.8307402168788308


In [58]:
#random forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))

Accuracy: 0.814935064935065
Precision: 0.8224852071005917
Recall: 0.9284351145038168
F1 score: 0.8722545943523083


In [59]:
#svm
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))

Accuracy: 0.812987012987013
Precision: 0.8281519861830743
Recall: 0.9150763358778626
F1 score: 0.8694469628286492


In [60]:
#cnn
from tensorflow.python.keras import regularizers
input_dim = X_train.shape[1]
model = Sequential()
model.add(layers.Dense(12, input_dim=input_dim, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.2, input_shape=(60,)))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)


In [61]:
history = model.fit(X_train, y_train, epochs=30, validation_data=(X_test, y_test), batch_size=30)

Epoch 1/30
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6873 - loss: 0.7078 - val_accuracy: 0.6805 - val_loss: 0.6249
Epoch 2/30
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6908 - loss: 0.6150 - val_accuracy: 0.6805 - val_loss: 0.6197
Epoch 3/30
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6834 - loss: 0.6184 - val_accuracy: 0.6805 - val_loss: 0.6178
Epoch 4/30
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6924 - loss: 0.6080 - val_accuracy: 0.6805 - val_loss: 0.6158
Epoch 5/30
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6908 - loss: 0.6063 - val_accuracy: 0.6805 - val_loss: 0.6133
Epoch 6/30
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6883 - loss: 0.6057 - val_accuracy: 0.6805 - val_loss: 0.6109
Epoch 7/30
[1m206/206[0m 

In [62]:
y_pred = model.predict(X_test)
y_pred = y_pred.flatten()
y_pred = (y_pred>0.6).astype(int)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))

[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Accuracy: 0.8246753246753247
Precision: 0.8635514018691589
Recall: 0.8816793893129771
F1 score: 0.8725212464589235


In [47]:

bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)
# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)
# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=2, batch_size = 32)
y_predicted = model.predict(X_test)
y_predicted = y_predicted.flatten()
print(y_predicted)

RuntimeError: Op type not registered 'CaseFoldUTF8' in binary running on VARUN-LAPTOP. Make sure the Op and Kernel are registered in the binary running in this process. Note that if you are loading a saved graph which used ops from tf.contrib (e.g. `tf.contrib.resampler`), accessing should be done before importing the graph, as contrib ops are lazily registered when the module is first accessed.