In [1]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# ML Libraries
from sklearn.metrics import accuracy_score,classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB

# Global Parameters
stop_words = set(stopwords.words('english'))

In [2]:
def load_dataset(filename, cols):
    dataset = pd.read_csv(filename, encoding='latin-1')
    dataset.columns = cols
    return dataset

In [3]:
def remove_unwanted_cols(dataset, cols):
    for col in cols:
        del dataset[col]
    return dataset

In [4]:
def preprocess_tweet_text(tweet):
    # Remove urls
    tweet=str(tweet)
    tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    tweet = re.sub(r'\@\w+|\#','', tweet)
    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
   
    tweet = re.sub(r"he's", "he is", tweet)
    tweet = re.sub(r"there's", "there is", tweet)
    tweet = re.sub(r"We're", "We are", tweet)
    tweet = re.sub(r"That's", "That is", tweet)
    tweet = re.sub(r"won't", "will not", tweet)
    tweet = re.sub(r"they're", "they are", tweet)
    tweet = re.sub(r"Can't", "Cannot", tweet)
    tweet = re.sub(r"wasn't", "was not", tweet)
    tweet = re.sub(r"don\x89Ûªt", "do not", tweet)
    tweet = re.sub(r"aren't", "are not", tweet)
    tweet = re.sub(r"isn't", "is not", tweet)
    tweet = re.sub(r"What's", "What is", tweet)
    tweet = re.sub(r"haven't", "have not", tweet)
    tweet = re.sub(r"hasn't", "has not", tweet)
    tweet = re.sub(r"There's", "There is", tweet)
    tweet = re.sub(r"He's", "He is", tweet)
    tweet = re.sub(r"It's", "It is", tweet)
    tweet = re.sub(r"You're", "You are", tweet)
    tweet = re.sub(r"I'M", "I am", tweet)
    tweet = re.sub(r"shouldn't", "should not", tweet)
    tweet = re.sub(r"wouldn't", "would not", tweet)
    tweet = re.sub(r"i'm", "I am", tweet)
    tweet = re.sub(r"I\x89Ûªm", "I am", tweet)
    tweet = re.sub(r"I'm", "I am", tweet)
    tweet = re.sub(r"Isn't", "is not", tweet)
    tweet = re.sub(r"Here's", "Here is", tweet)
    tweet = re.sub(r"you've", "you have", tweet)
    tweet = re.sub(r"you\x89Ûªve", "you have", tweet)
    tweet = re.sub(r"we're", "we are", tweet)
    tweet = re.sub(r"what's", "what is", tweet)
    tweet = re.sub(r"couldn't", "could not", tweet)
    tweet = re.sub(r"we've", "we have", tweet)
    tweet = re.sub(r"it\x89Ûªs", "it is", tweet)
    tweet = re.sub(r"doesn\x89Ûªt", "does not", tweet)
    tweet = re.sub(r"It\x89Ûªs", "It is", tweet)
    tweet = re.sub(r"Here\x89Ûªs", "Here is", tweet)
    tweet = re.sub(r"who's", "who is", tweet)
    tweet = re.sub(r"I\x89Ûªve", "I have", tweet)
    tweet = re.sub(r"y'all", "you all", tweet)
    tweet = re.sub(r"can\x89Ûªt", "cannot", tweet)
    tweet = re.sub(r"would've", "would have", tweet)
    tweet = re.sub(r"it'll", "it will", tweet)
    tweet = re.sub(r"we'll", "we will", tweet)
    tweet = re.sub(r"wouldn\x89Ûªt", "would not", tweet)
    tweet = re.sub(r"We've", "We have", tweet)
    tweet = re.sub(r"he'll", "he will", tweet)
    tweet = re.sub(r"Y'all", "You all", tweet)
    tweet = re.sub(r"Weren't", "Were not", tweet)
    tweet = re.sub(r"Didn't", "Did not", tweet)
    tweet = re.sub(r"they'll", "they will", tweet)
    tweet = re.sub(r"they'd", "they would", tweet)
    tweet = re.sub(r"DON'T", "DO NOT", tweet)
    tweet = re.sub(r"That\x89Ûªs", "That is", tweet)
    tweet = re.sub(r"they've", "they have", tweet)
    tweet = re.sub(r"i'd", "I would", tweet)
    tweet = re.sub(r"should've", "should have", tweet)
    tweet = re.sub(r"You\x89Ûªre", "You are", tweet)
    tweet = re.sub(r"where's", "where is", tweet)
    tweet = re.sub(r"Don\x89Ûªt", "Do not", tweet)
    tweet = re.sub(r"we'd", "we would", tweet)
    tweet = re.sub(r"i'll", "I will", tweet)
    tweet = re.sub(r"weren't", "were not", tweet)
    tweet = re.sub(r"They're", "They are", tweet)
    tweet = re.sub(r"Can\x89Ûªt", "Cannot", tweet)
    tweet = re.sub(r"you\x89Ûªll", "you will", tweet)
    tweet = re.sub(r"I\x89Ûªd", "I would", tweet)
    tweet = re.sub(r"let's", "let us", tweet)
    tweet = re.sub(r"it's", "it is", tweet)
    tweet = re.sub(r"can't", "cannot", tweet)
    tweet = re.sub(r"don't", "do not", tweet)
    tweet = re.sub(r"you're", "you are", tweet)
    tweet = re.sub(r"i've", "I have", tweet)
    tweet = re.sub(r"that's", "that is", tweet)
    tweet = re.sub(r"i'll", "I will", tweet)
    tweet = re.sub(r"doesn't", "does not", tweet)
    tweet = re.sub(r"i'd", "I would", tweet)
    tweet = re.sub(r"didn't", "did not", tweet)
    tweet = re.sub(r"ain't", "am not", tweet)
    tweet = re.sub(r"you'll", "you will", tweet)
    tweet = re.sub(r"I've", "I have", tweet)
    tweet = re.sub(r"Don't", "do not", tweet)
    tweet = re.sub(r"I'll", "I will", tweet)
    tweet = re.sub(r"I'd", "I would", tweet)
    tweet = re.sub(r"Let's", "Let us", tweet)
    tweet = re.sub(r"you'd", "You would", tweet)
    tweet = re.sub(r"It's", "It is", tweet)
    tweet = re.sub(r"Ain't", "am not", tweet)
    tweet = re.sub(r"Haven't", "Have not", tweet)
    tweet = re.sub(r"Could've", "Could have", tweet)
    tweet = re.sub(r"youve", "you have", tweet)  
    tweet = re.sub(r"donå«t", "do not", tweet)  
    tweet = re.sub(r"some1", "someone", tweet)
    tweet = re.sub(r"yrs", "years", tweet)
    tweet = re.sub(r"hrs", "hours", tweet)
    tweet = re.sub(r"2morow|2moro", "tomorrow", tweet)
    tweet = re.sub(r"2day", "today", tweet)
    tweet = re.sub(r"4got|4gotten", "forget", tweet)
    tweet = re.sub(r"b-day|bday", "b-day", tweet)
    tweet = re.sub(r"mother's", "mother", tweet)
    tweet = re.sub(r"mom's", "mom", tweet)
    tweet = re.sub(r"dad's", "dad", tweet)
    tweet = re.sub(r"hahah|hahaha|hahahaha", "haha", tweet)
    tweet = re.sub(r"lmao|lolz|rofl", "lol", tweet)
    tweet = re.sub(r"thanx|thnx", "thanks", tweet)
    tweet = re.sub(r"goood", "good", tweet)
    tweet = re.sub(r"some1", "someone", tweet)
    tweet = re.sub(r"some1", "someone", tweet)
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if not w in stop_words]
    
    #ps = PorterStemmer()
    #stemmed_words = [ps.stem(w) for w in filtered_words]
    #lemmatizer = WordNetLemmatizer()
    #lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]
    
    return " ".join(filtered_words)

In [5]:
def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector

In [6]:
def int_to_string(sentiment):
    if sentiment == 0:
        return "Negative"
    elif sentiment == 2:
        return "Neutral"
    else:
        return "Positive"

In [7]:
# Load dataset
dataset = load_dataset("depression_detection_dataset.csv", ['Unnamed','Text', 'Class'])
# Remove unwanted columns from dataset
#Preprocess data
dataset.text = dataset['Text'].apply(preprocess_tweet_text)
print(dataset.text)
# Split dataset into Train, Test
dataset.isnull().sum()

0        Ex Wife Threatening 1Recently I left wife good...
1        Am I weird I dont get affected compliments com...
2        Finally 2020 almost So I never hear 2020 bad y...
3                        need helpjust help im crying hard
4                                                     kill
                               ...                        
10207    My life fucking boring I got excited try new M...
10208    This sucksIve really fallen deep hole Living p...
10209    Bros didnt tell penis inspection wasnt meme I ...
10210    I want kill tonight Im trying get attentionI b...
10211                                           I want die
Name: Text, Length: 10212, dtype: object


  dataset.text = dataset['Text'].apply(preprocess_tweet_text)


Unnamed    2
Text       5
Class      8
dtype: int64

In [8]:
dataset = dataset.dropna()

In [9]:
dataset.isnull().sum()

Unnamed    0
Text       0
Class      0
dtype: int64

In [10]:
# Same tf vector will be used for Testing sentiments on unseen trending data
from sklearn.feature_extraction.text import CountVectorizer
corpus = dataset.Text
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()
#print("Vocabulary: ", vectorizer.vocabulary_)
X=X.toarray()
X=pd.DataFrame(X)
# X

# # Same tf vector will be used for Testing sentiments on unseen trending data
# tf_vector = get_feature_vector(np.array(dataset.Text).ravel())
# X = tf_vector.transform(np.array(dataset.Text).ravel())
# y = np.array(dataset.Unnamed).ravel()
# X=X.toarray()
# X=pd.DataFrame(X)

y=dataset.Class
y=pd.DataFrame(y)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28630,28631,28632,28633,28634,28635,28636,28637,28638,28639
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10199,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10200,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10201,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# #y=pd.DataFrame(y)
# y=dataset.Class
# y=pd.DataFrame(y)
# y

X.shape
y.shape

(10203, 1)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
X_train.shape

(8162, 28640)

In [14]:
#Training Naive Bayes model
NB_model = MultinomialNB()

In [15]:
NB_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [16]:
y_predict_nb = NB_model.predict(X_test)

In [17]:
print(accuracy_score(y_test, y_predict_nb))

0.8515433610975012


In [18]:
print(classification_report(y_test, y_predict_nb))

                                                          precision    recall  f1-score   support

 with our intelligence and our knowledge of the cosmos."       0.00      0.00      0.00         1
                                                       0       0.91      0.78      0.84      1023
                                                       1       0.81      0.93      0.86      1017

                                                accuracy                           0.85      2041
                                               macro avg       0.57      0.57      0.57      2041
                                            weighted avg       0.86      0.85      0.85      2041



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# from sklearn.naive_bayes import MultinomialNB
# mnb = MultinomialNB()
# mnb.fit(X_train,y_train)

In [20]:
# y_pred4 = mnb.predict(X_test)
# accuracy_score(y_test,y_pred4)

In [21]:
# Training Logistics Regression model
# LR_model = LogisticRegression(solver='lbfgs')
# LR_model.fit(X_train, y_train)
# y_predict_lr = LR_model.predict(X_test)
# print(accuracy_score(y_test, y_predict_lr))

In [22]:
#dtc = DecisionTreeClassifier()

In [23]:
#dtc.fit(X_train,y_train)

In [24]:
#y_pred = dtc.predict(X_test)

In [25]:
#print(accuracy_score(y_test,y_pred))

In [26]:
#BNBmodel=BernoulliNB()

In [27]:
#BNBmodel.fit(X_train,y_train)

In [28]:
#y_pred=BNBmodel.predict(X_test)

In [29]:
#print(accuracy_score(y_test,y_pred))

In [30]:
def preprocess(data):
    #preprocess
    a = re.sub('[^a-zA-Z]',' ',data)
    a = a.lower()
    a = a.split()
    a = [a.lemmatize(word) for word in a ]
    a = ' '.join(a)  
    return a

In [31]:
# import re
# strr = input('Enter Your Message: ')
# print("-------------------------------")
# examples = strr
# import snscrape.modules.twitter as sntwitter

# query = "(from:Priyanshy_) until:2023-01-01 since:2014-01-01"
# tweets = []
# limits = 1
# for tweet in sntwitter.TwitterSearchScraper(query).get_items():
#     # print(vars(tweet))
#     # break
#     if len(tweets) == limits:
#         break
#     else:
#         tweets.append([tweet.content])
# df=pd.DataFrame(tweets,columns=['Tweet'])

# a = preprocess(examples)
# example_counts = vectorizer.transform([a])
# prediction =mnb.predict(example_counts)
# prediction[0]

# if prediction[0]==0:
#     print('Positive')
# elif prediction[0]==1:
#     print('Depressive')
    
# import re
# strr = input('Enter Your Message: ')
# print("-------------------------------")
# examples = strr
# a = preprocess_tweet_text(examples)
# #a = preprocess(examples)

# example_counts = vectorizer.transform([a])
# prediction =NB_model.predict(example_counts)
# #print(prediction)
# x = int(prediction)

# if x==1:
#     print('Depressive')
# elif x==0:
#     print('Non-Depressive')

In [32]:
import pickle
filename = 'vectorizer.pkl'
pickle.dump(vectorizer, open(filename, 'wb'))

In [33]:
filename = 'prediction.pkl'
pickle.dump(NB_model, open('prediction.pkl', 'wb'))

In [34]:
#model=pickle.load(open('saved_steps.pkl','rb'))

In [35]:
# dataa = {"model":NB_model}
# with open('saved_steps.pkl','wb') as file:
#     pickle.dump(dataa,file)

In [36]:
# with open('saved_steps.pkl','rb') as file:
#     dataa=pickle.load(file)
    
# classifierr = dataa["model"]

In [37]:
# pred1 = classifierr.predict(X_test)
# tc1 = NB_model.score(X_train,y_train)
# acc1 = accuracy_score(y_test,pred1)
# acc1