In [35]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
#Loading the dataset
df=pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)

#Assign column names
df.columns=['target','id','date','flag','user','text']

df['target']=df['target'].map({0:0,2:1,4:1})

#displayiing data
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
def preprocess(text):
    #removing all special characters for getting pure text
    text=re.sub(r"http\S+|@\S+|#\S+|[^A-Za-z\s]","",text)
    tokens=word_tokenize(text)
    tokens=[word for word in tokens if word.lower() not in stopwords.words('english')]
    #Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens=[lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(tokens)

In [5]:
df['text']=df['text'].apply(preprocess)
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,Awww thats bummer shoulda got David Carr Third...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,upset cant update Facebook texting might cry r...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,dived many time ball Managed save rest go bound
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,whole body feel itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,behaving im mad cant see


In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

#initialize tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['text'])

#convert text to integers
x=tokenizer.texts_to_sequences(df['text'])
x=pad_sequences(x,maxlen=100)
print(x.shape)

(1600000, 100)


In [32]:
#Labeling
y=df['target'].values
print(y.shape)

(1600000,)


In [33]:
#creating train and test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(x_train.shape, x_test.shape)

(1280000, 100) (320000, 100)


In [36]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

model=Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary

<bound method Model.summary of <Sequential name=sequential_1, built=False>>

In [37]:
#training the model
his=model.fit(x_train,y_train,epochs=5,batch_size=40,validation_split=0.2)

Epoch 1/5
[1m25600/25600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2990s[0m 117ms/step - accuracy: 0.7611 - loss: 0.4883 - val_accuracy: 0.7854 - val_loss: 0.4522
Epoch 2/5
[1m25600/25600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3195s[0m 125ms/step - accuracy: 0.7904 - loss: 0.4443 - val_accuracy: 0.7893 - val_loss: 0.4459
Epoch 3/5
[1m25600/25600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2917s[0m 114ms/step - accuracy: 0.7968 - loss: 0.4334 - val_accuracy: 0.7901 - val_loss: 0.4446
Epoch 4/5
[1m25600/25600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2918s[0m 114ms/step - accuracy: 0.8028 - loss: 0.4235 - val_accuracy: 0.7894 - val_loss: 0.4462
Epoch 5/5
[1m25600/25600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3213s[0m 126ms/step - accuracy: 0.8069 - loss: 0.4157 - val_accuracy: 0.7907 - val_loss: 0.4452


In [39]:
accuracy=model.evaluate(x_test,y_test)
print(f'Accuracy:{accuracy}')

[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m330s[0m 33ms/step - accuracy: 0.7914 - loss: 0.4448
Accuracy:[0.4448949694633484, 0.7915906310081482]


In [53]:
def predict_depression(text):
    processed_text = preprocess(text)
    sequence = tokenizer.texts_to_sequences([processed_text])
    padded_sequence = pad_sequences(sequence, maxlen=100)
    prediction = model.predict(padded_sequence)
    return 'Depressed' if prediction[0][0] < 0.5 else 'Not Depressed'

In [62]:
while True:
    a=input("CHOICES...\n1.Enter a tweet to check depression?\n2.Exit the model\nEnter your choice:")
    n=int(a)
    if n==1:
        tweet=input("Enter the tweet:")
        print(predict_depression(tweet))
    elif n==2:
        print("THANKS for using our model")
        break
    else:
        print("invalid input")

CHOICES...
1.Enter a tweet to check depression?
2.Exit the model
Enter your choice: 1
Enter the tweet: I don't feel like going work today.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
Depressed


CHOICES...
1.Enter a tweet to check depression?
2.Exit the model
Enter your choice: 1
Enter the tweet: Virat Kohli is indeed a king.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Not Depressed


CHOICES...
1.Enter a tweet to check depression?
2.Exit the model
Enter your choice: 2


THANKS for using our model
