## Constants

In [1]:
import re
import string;
from nltk.corpus import stopwords

EMOJI_PATTERN=re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002500-\U00002BEF"  # chinese char
                           u"\U00002702-\U000027B0"
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           u"\U0001f926-\U0001f937"
                           u"\U00010000-\U0010ffff"
                           u"\u2640-\u2642"
                           u"\u2600-\u2B55"
                           u"\u200d"
                           u"\u23cf"
                           u"\u23e9"
                           u"\u231a"
                           u"\ufe0f"  # dingbats
                           u"\u3030"
                           "]+", re.UNICODE);
PUNCT_TO_REMOVE = string.punctuation.replace("'", "")
STOP_WORDS=stopwords.words("english")
STOP_WORDS.extend(["'s","'m","'ve","s","#"])

TRAIN_SPLIT = 0.6
TEST_SPLIT = 0.2
LEARNING_RATE = 0.1
EPOCHS= 10

## Import libraries and datasets

In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import re
import spacy
import en_core_web_sm

from nltk.tokenize import word_tokenize, RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer

from bs4 import BeautifulSoup
from html import unescape

reddit_suidiceWatch = pd.read_csv('../data/reddit_depression_suicidewatch.csv') #depression and SuicideWatch. long texts (limit it somehow). row=20365
# depressive_tweets = pd.read_csv('depressive_tweets_processed.csv', sep = '|', header = None, usecols = range(0,9), nrows = 3200) #all depressed, no label needed. row= 4078
tweet_mental_health_classification = pd.read_csv('../data/tweet-mental-health-classification-train.csv') # stressed, anxious, ..... first column, no column name. row= 1.048.575
training160000 = pd.read_csv('../data/training.1600000.processed.noemoticon.csv',encoding='latin-1') # 0=negative, 2=notr, 4=positive. first column, no column name. row= 1.048.575

training160000.drop(columns=training160000.columns[1:5], axis=1, inplace=True)
column_names=["labels",'tweets']
training160000.columns = column_names
new_cols = ["tweets","labels"]
training160000=training160000.reindex(columns=new_cols)

# depressive_tweets.drop(columns=depressive_tweets.columns[-3:], axis=1, inplace=True)
# depressive_tweets.drop(columns=depressive_tweets.columns[0:5], axis=1, inplace=True)

# column_names_d=['tweet']
# depressive_tweets.columns = column_names_d

nlp = en_core_web_sm.load()

## Inspect and Split Datasets

### 1.training160000 (positive, negative)

In [3]:
positive_training160000 = training160000[training160000['labels'] == 4]
DF_POSITIVE = positive_training160000
print("positive size:")
DF_POSITIVE['labels'] = "Positive"
print(len(DF_POSITIVE))
DF_POSITIVE.head(3)

positive size:
800000


Unnamed: 0,tweets,labels
799999,I LOVE @Health4UandPets u guys r the best!!,Positive
800000,im meeting up with one of my besties tonight! ...,Positive
800001,"@DaRealSunisaKim Thanks for the Twitter add, S...",Positive


### 2.reddit_suidiceWatch (Depression, Suidical)

In [4]:
def divideRedditTextToTweetLength(paragraph):
    lines = []
    line = ''
    if(len(line) + len(paragraph) + 1 < 240):
        return [paragraph]
    for sentence in (s.strip()+'.' for s in paragraph.split('.')[:-1]):
        if len(line) + len(sentence) + 1 >= 240: #overfitting.
            lines.append(line)
            line = sentence
        else:                                   
            line += ' ' + sentence  
    print(lines)
    return lines

In [5]:
# no time:( suicidal_reddit_list = [divideRedditTextToTweetLength(paragraph) for paragraph in suicidal_reddit['tweets'].tolist()]

suicidal_reddit = reddit_suidiceWatch[reddit_suidiceWatch['label'] == 'SuicideWatch']
DF_SUICIDAL = suicidal_reddit
DF_SUICIDAL['label'] = "Suicidal"
DF_SUICIDAL.rename(columns = {'text':'tweets','label':'labels'}, inplace = True)
DF_SUICIDAL['tweets'] = DF_SUICIDAL['tweets'].str.slice(0,240) # paragrafı parçalara ayırmak yerine ilk 240ı almayı tercih ettik (şimdilik)

print("depressive_reddit_suicide size:")
print(len(DF_SUICIDAL))
DF_SUICIDAL.head(3)

depressive_reddit_suicide size:
9992


Unnamed: 0,tweets,labels
3,I am so exhausted of this. Just when I think I...,Suicidal
5,I am 20 year old with some good friends but I ...,Suicidal
8,it is looming around the corner again. It alwa...,Suicidal


In [6]:
depressive_reddit = reddit_suidiceWatch[reddit_suidiceWatch['label'] == 'depression']
print("depressive_reddit_depression size:")
depressive_reddit.rename(columns = {'text':'tweets','label':'labels'}, inplace = True)
depressive_reddit['labels'] = "Depressive"
print(len(depressive_reddit))
depressive_reddit

depressive_reddit_depression size:
10371


Unnamed: 0,tweets,labels
0,I recently went through a breakup and she said...,Depressive
1,"I do not know how to navigate these feelings, ...",Depressive
2,"So I have been with my bf for 5 months , and h...",Depressive
4,I have been severly bullied since i was 5 till...,Depressive
6,My mom made me go to a camp that she knows I h...,Depressive
...,...,...
20355,cannot even decide where to start. Low self es...,Depressive
20359,that is what has happened to me last week. And...,Depressive
20360,Ever just feel alone in a house full of people...,Depressive
20361,Politicians. Neighbors. Corporations. Society....,Depressive


In [7]:
type(suicidal_reddit['tweets'])


pandas.core.series.Series

### 3.depressive_tweets

In [14]:
print(len(depressive_tweets))
# print(depressive_tweets)
# print(depressive_reddit)

depressive_tweets['label']='Depressive'
depressive_tweets.rename(columns = {'tweet':'tweets','label':'labels'}, inplace = True)
depressive_tweets

NameError: name 'depressive_tweets' is not defined

#### merge depressive_tweets and depressive_reddit


In [8]:
# DF_DEPRESSIVE=pd.concat([depressive_reddit,depressive_tweets],axis=0)
DF_DEPRESSIVE=depressive_reddit
DF_DEPRESSIVE['tweets'] = DF_DEPRESSIVE['tweets'].str.slice(0,240)
print(len(DF_DEPRESSIVE))
DF_DEPRESSIVE.head(3)

10371


Unnamed: 0,tweets,labels
0,I recently went through a breakup and she said...,Depressive
1,"I do not know how to navigate these feelings, ...",Depressive
2,"So I have been with my bf for 5 months , and h...",Depressive


### tweet_mental_health_classification (Stressed, Anxious, Normal, Lonely)

In [9]:
DF_STRESSED = tweet_mental_health_classification[tweet_mental_health_classification['labels'] == 'Stressed']
print(len(DF_STRESSED))
DF_STRESSED['tweets'].head(5)

6840


0     sending solidarity whoever doctor manage incre...
11                            feel beautiful woman sad 
18    tire fight foreign occupier fight defend homel...
21    boy get slaughter helpless mother could scream...
27    sad thing disinformation truth come damage alr...
Name: tweets, dtype: object

In [10]:
DF_ANXIOUS = tweet_mental_health_classification[tweet_mental_health_classification['labels'] == 'Anxious']
print(len(DF_ANXIOUS))
DF_ANXIOUS.head(5)

8388


Unnamed: 0,tweets,labels
1,need see hair amp beard gat book appointment b...,Anxious
6,good point remember 1013 leave alone pokie clu...,Anxious
8,okay ik lot people go want im make gc ashnikko...,Anxious
14,ron desantis danger florida democracy want pre...,Anxious
19,fact get mecha work boy girl doggy style posit...,Anxious


In [11]:
DF_NORMAL = tweet_mental_health_classification[tweet_mental_health_classification['labels'] == 'Normal']
print(len(DF_NORMAL))
DF_NORMAL.head(5)

7973


Unnamed: 0,tweets,labels
2,next time meet someone new dont ask ask love,Normal
4,raise hand junhoes ocean lotion life rent free...,Normal
5,mariposa de barrio teach matter guy forever ch...,Normal
7,mori wip white dress,Normal
9,im gonna say,Normal


In [12]:
DF_LONELY = tweet_mental_health_classification[tweet_mental_health_classification['labels'] == 'Lonely']
print(len(DF_LONELY))
DF_LONELY.head(5)

6788


Unnamed: 0,tweets,labels
3,surprise someone love give la senza gift box r...,Lonely
10,people suffer know difficult mourn one person ...,Lonely
15,2021 im bring back tune 20142016 era aka actua...,Lonely
16,base abuja get 5years experience office admini...,Lonely
20,guess need feel comfort biden protect us evil ...,Lonely


## Create corpus

In [13]:
CORPUS=pd.concat([DF_LONELY,DF_NORMAL,DF_ANXIOUS,DF_STRESSED,DF_DEPRESSIVE,DF_SUICIDAL,DF_POSITIVE],axis=0)
CORPUS.labels.value_counts()

Positive      800000
Depressive     10371
Suicidal        9992
Anxious         8388
Normal          7973
Stressed        6840
Lonely          6788
Name: labels, dtype: int64

## split test and train data

In [14]:
from sklearn.model_selection import train_test_split


In [30]:
DF_LONELY_Xtrain, DF_LONELY_Xtest, DF_LONELY_Ytrain, DF_LONELY_Ytest = train_test_split(
    DF_LONELY.tweets,
    DF_LONELY.labels,
    test_size=0.2,
    random_state=2022,
    stratify=DF_LONELY.labels)
print(len(DF_LONELY_Xtrain), len(DF_LONELY_Ytrain), len(DF_LONELY_Xtest), len(DF_LONELY_Ytest))

DF_NORMAL_Xtrain, DF_NORMAL_Xtest, DF_NORMAL_Ytrain, DF_NORMAL_Ytest = train_test_split(
    DF_NORMAL.tweets,
    DF_NORMAL.labels,
    test_size=0.2,
    random_state=2022,
    stratify=DF_NORMAL.labels)
print(len(DF_NORMAL_Xtrain), len(DF_NORMAL_Ytrain), len(DF_NORMAL_Xtest), len(DF_NORMAL_Ytest))

DF_ANXIOUS_Xtrain, DF_ANXIOUS_Xtest, DF_ANXIOUS_Ytrain, DF_ANXIOUS_Ytest = train_test_split(
    DF_ANXIOUS.tweets,
    DF_ANXIOUS.labels,
    test_size=0.2,
    random_state=2022,
    stratify=DF_ANXIOUS.labels)
print(len(DF_ANXIOUS_Xtrain), len(DF_ANXIOUS_Ytrain), len(DF_ANXIOUS_Xtest), len(DF_ANXIOUS_Ytest))

DF_STRESSED_Xtrain, DF_STRESSED_Xtest, DF_STRESSED_Ytrain, DF_STRESSED_Ytest = train_test_split(
    DF_STRESSED.tweets,
    DF_STRESSED.labels,
    test_size=0.2,
    random_state=2022,
    stratify=DF_STRESSED.labels)
print(len(DF_STRESSED_Xtrain), len(DF_STRESSED_Ytrain), len(DF_STRESSED_Xtest), len(DF_STRESSED_Ytest))

DF_DEPRESSIVE_Xtrain, DF_DEPRESSIVE_Xtest, DF_DEPRESSIVE_Ytrain, DF_DEPRESSIVE_Ytest = train_test_split(
    DF_DEPRESSIVE.tweets,
    DF_DEPRESSIVE.labels,
    test_size=0.2,
    random_state=2022,
    stratify=DF_DEPRESSIVE.labels)
print(len(DF_DEPRESSIVE_Xtrain), len(DF_DEPRESSIVE_Ytrain), len(DF_DEPRESSIVE_Xtest), len(DF_DEPRESSIVE_Ytest))

DF_SUICIDAL_Xtrain, DF_SUICIDAL_Xtest, DF_SUICIDAL_Ytrain, DF_SUICIDAL_Ytest = train_test_split(
    DF_SUICIDAL.tweets,
    DF_SUICIDAL.labels,
    test_size=0.2,
    random_state=2022,
    stratify=DF_SUICIDAL.labels)
print(len(DF_SUICIDAL_Xtrain), len(DF_SUICIDAL_Ytrain), len(DF_SUICIDAL_Xtest), len(DF_SUICIDAL_Ytest))

DF_POSITIVE_Xtrain, DF_POSITIVE_Xtest, DF_POSITIVE_Ytrain, DF_POSITIVE_Ytest = train_test_split(
    DF_POSITIVE.tweets,
    DF_POSITIVE.labels,
    test_size=0.2,
    random_state=2022,
    stratify=DF_POSITIVE.labels)
print(len(DF_POSITIVE_Xtrain), len(DF_POSITIVE_Ytrain), len(DF_POSITIVE_Xtest), len(DF_POSITIVE_Ytest))

5430 5430 1358 1358
6378 6378 1595 1595
6710 6710 1678 1678
5472 5472 1368 1368
8296 8296 2075 2075
7993 7993 1999 1999
640000 640000 160000 160000


In [34]:
default_train_size=15000
default_test_size=3000
positive_train_size=40000
positive_test_size=8000

DF_POSITIVE_Xtrain_under = DF_POSITIVE_Xtrain.sample(positive_train_size)
DF_POSITIVE_Ytrain_under = DF_POSITIVE_Ytrain.sample(positive_train_size)
DF_POSITIVE_Xtest_under = DF_POSITIVE_Xtest.sample(positive_test_size)
DF_POSITIVE_Ytest_under = DF_POSITIVE_Ytest.sample(positive_test_size)
print(DF_POSITIVE_Ytrain_under[:2])

DF_SUICIDAL_Xtrain_over = DF_SUICIDAL_Xtrain.sample(default_train_size,replace=True)
DF_SUICIDAL_Ytrain_over = DF_SUICIDAL_Ytrain.sample(default_train_size,replace=True)
DF_SUICIDAL_Xtest_over = DF_SUICIDAL_Xtest.sample(default_test_size,replace=True)
DF_SUICIDAL_Ytest_over = DF_SUICIDAL_Ytest.sample(default_test_size,replace=True)
print(DF_SUICIDAL_Ytrain_over[:2])

DF_DEPRESSIVE_Xtrain_over = DF_DEPRESSIVE_Xtrain.sample(default_train_size,replace=True)
DF_DEPRESSIVE_Ytrain_over = DF_DEPRESSIVE_Ytrain.sample(default_train_size,replace=True)
DF_DEPRESSIVE_Xtest_over = DF_DEPRESSIVE_Xtest.sample(default_test_size,replace=True)
DF_DEPRESSIVE_Ytest_over = DF_DEPRESSIVE_Ytest.sample(default_test_size,replace=True)

DF_STRESSED_Xtrain_over = DF_STRESSED_Xtrain.sample(default_train_size,replace=True)
DF_STRESSED_Ytrain_over = DF_STRESSED_Ytrain.sample(default_train_size,replace=True)
DF_STRESSED_Xtest_over = DF_STRESSED_Xtest.sample(default_test_size,replace=True)
DF_STRESSED_Ytest_over = DF_STRESSED_Ytest.sample(default_test_size,replace=True)

DF_ANXIOUS_Xtrain_over = DF_ANXIOUS_Xtrain.sample(default_train_size,replace=True)
DF_ANXIOUS_Ytrain_over = DF_ANXIOUS_Ytrain.sample(default_train_size,replace=True)
DF_ANXIOUS_Xtest_over = DF_ANXIOUS_Xtest.sample(default_test_size,replace=True)
DF_ANXIOUS_Ytest_over = DF_ANXIOUS_Ytest.sample(default_test_size,replace=True)

DF_NORMAL_Xtrain_over = DF_NORMAL_Xtrain.sample(default_train_size,replace=True)
DF_NORMAL_Ytrain_over = DF_NORMAL_Ytrain.sample(default_train_size,replace=True)
DF_NORMAL_Xtest_over = DF_NORMAL_Xtest.sample(default_test_size,replace=True)
DF_NORMAL_Ytest_over = DF_NORMAL_Ytest.sample(default_test_size,replace=True)

DF_LONELY_Xtrain_over = DF_LONELY_Xtrain.sample(default_train_size,replace=True)
DF_LONELY_Ytrain_over = DF_LONELY_Ytrain.sample(default_train_size,replace=True)
DF_LONELY_Xtest_over = DF_LONELY_Xtest.sample(default_test_size,replace=True)
DF_LONELY_Ytest_over = DF_LONELY_Ytest.sample(default_test_size,replace=True)

type(DF_LONELY_Ytest_over)

1278146    Positive
956214     Positive
Name: labels, dtype: object
18626    Suicidal
10708    Suicidal
Name: labels, dtype: object


  print(DF_POSITIVE_Ytrain_under[:2])
  print(DF_SUICIDAL_Ytrain_over[:2])


pandas.core.series.Series

In [37]:
X_train = pd.concat([DF_POSITIVE_Xtrain_under,DF_SUICIDAL_Xtrain_over,DF_DEPRESSIVE_Xtrain_over,
           DF_STRESSED_Xtrain_over,DF_ANXIOUS_Xtrain_over,DF_NORMAL_Xtrain_over,DF_LONELY_Xtrain_over])
Y_train = pd.concat([DF_POSITIVE_Ytrain_under,DF_SUICIDAL_Ytrain_over,DF_DEPRESSIVE_Ytrain_over,
           DF_STRESSED_Ytrain_over,DF_ANXIOUS_Ytrain_over,DF_NORMAL_Ytrain_over,DF_LONELY_Ytrain_over])
X_test = pd.concat([DF_POSITIVE_Xtest_under,DF_SUICIDAL_Xtest_over,DF_DEPRESSIVE_Xtest_over,
           DF_STRESSED_Xtest_over,DF_ANXIOUS_Xtest_over,DF_NORMAL_Xtest_over,DF_LONELY_Xtest_over])
Y_test = pd.concat([DF_POSITIVE_Ytest_under,DF_SUICIDAL_Ytest_over,DF_DEPRESSIVE_Ytest_over,
           DF_STRESSED_Ytest_over,DF_ANXIOUS_Ytest_over,DF_NORMAL_Ytest_over,DF_LONELY_Ytest_over])
print(Y_train)

1278146    Positive
956214     Positive
974637     Positive
1532788    Positive
1081964    Positive
             ...   
8672         Lonely
25101        Lonely
14497        Lonely
15707        Lonely
14497        Lonely
Name: labels, Length: 130000, dtype: object


In [41]:
X_train=X_train.reset_index(drop=True)
X_test=X_test.reset_index(drop=True)
Y_train=Y_train.reset_index(drop=True)
Y_test=Y_test.reset_index(drop=True)

print(X_test)


0        @catiams  oh bummer! :S how are your exams goi...
1               @Andie02 get ready for the beach tomorrow 
2            today is going to be fun! party all day long 
3             news flash!! alxm5 just sold his iphone lol 
4        @Vosty Morning Wormie  Been here for a while, ...
                               ...                        
25995             hard day love alive thats huge progress 
25996    people suffer know difficult mourn one person ...
25997    love people love cause im type show love right...
25998    advisor tell instead try game market need put ...
25999    might get treat vote video entry celebrate lov...
Name: tweets, Length: 26000, dtype: object


In [42]:
print(X_train[:3])
print(Y_train[:3])
print(X_test[:3])
print(Y_test[:3])

0                  hey twiter, I'm back..! 
1    Good afternoon @SwaqqedOutMissy  baby 
2                  @damonDCclark Wallace's 
Name: tweets, dtype: object
0    Positive
1    Positive
2    Positive
Name: labels, dtype: object
0    @catiams  oh bummer! :S how are your exams goi...
1           @Andie02 get ready for the beach tomorrow 
2        today is going to be fun! party all day long 
Name: tweets, dtype: object
0    Positive
1    Positive
2    Positive
Name: labels, dtype: object


Görüldüğü üzere oranlar çok dengesiz. Dengelenmeye ihtiyacı var

## Preprocessing and Balancing

In [44]:
def preprocess(tweet):
#     print(tweet)
    ans = tweet.lower() # lowercase
    ans = re.sub('@[^\s]+', '', ans)   # remove mentions
    soup = BeautifulSoup(unescape(ans), 'lxml')
    ans = soup.text
#     print("after -----------")
    ans = EMOJI_PATTERN.sub(r'', ans)  #remove emojis TODO: remove also symbols like :) :D vs.
    ans = re.sub(r'http\S+', '', ans)  # remove links
    ans = " ".join(ans.split())        # remove unnecessarily spaces
    
    data = nlp(ans)
    filtered_tokens=[]
    for token in data:
        if token.is_stop or token.is_punct:
            continue;
        filtered_tokens.append(token.lemma_) #lemmatizes token

    ans = " ".join(filtered_tokens) 
    
#     print(ans)
    return ans

In [52]:
X_train_processed = X_train.copy()
tqdm.pandas()
X_train_processed = X_train_processed.progress_apply(preprocess)

100%|█████████████████████████████████████████████████████████████████████████| 130000/130000 [14:20<00:00, 151.00it/s]


In [48]:
X_test_processed = X_test.copy()
tqdm.pandas()
X_test_processed = X_test_processed.progress_apply(preprocess)

100%|███████████████████████████████████████████████████████████████████████████| 26000/26000 [02:45<00:00, 156.86it/s]


## Split test and train data

In [47]:
X_train,Y_train

(0                                  hey twiter, I'm back..! 
 1                    Good afternoon @SwaqqedOutMissy  baby 
 2                                  @damonDCclark Wallace's 
 3                        @junkfoodfm Ahah sympa le concept 
 4         got my fill of great sun, great food, great fr...
                                 ...                        
 129995    human need job cant exist amp make art chill cat 
 129996              whyd let feel lonely love hard darkest 
 129997                                           need help 
 129998    alright go seeing 53 spoiler ban lift lost tho...
 129999    free vice b26c4628 battle id need backup lvl 1...
 Name: tweets, Length: 130000, dtype: object,
 0         Positive
 1         Positive
 2         Positive
 3         Positive
 4         Positive
             ...   
 129995      Lonely
 129996      Lonely
 129997      Lonely
 129998      Lonely
 129999      Lonely
 Name: labels, Length: 130000, dtype: object)

In [51]:
X_train_processed

0                                  hey twiter, I'm back..! 
1                    Good afternoon @SwaqqedOutMissy  baby 
2                                  @damonDCclark Wallace's 
3                        @junkfoodfm Ahah sympa le concept 
4         got my fill of great sun, great food, great fr...
                                ...                        
129995    human need job cant exist amp make art chill cat 
129996              whyd let feel lonely love hard darkest 
129997                                           need help 
129998    alright go seeing 53 spoiler ban lift lost tho...
129999    free vice b26c4628 battle id need backup lvl 1...
Name: tweets, Length: 130000, dtype: object

# Train

MultinomialNB | Unigram | Processed data

In [53]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

classifier_mb1=Pipeline([
    ('vectorizer_bow',CountVectorizer()),
    ('Multi NB', MultinomialNB())
])
classifier_mb1.fit(X_train,Y_train)
Y_pred= classifier_mb1.predict(X_test)
print(classification_report(Y_test,Y_pred))


              precision    recall  f1-score   support

     Anxious       0.34      0.45      0.39      3000
  Depressive       0.59      0.67      0.63      3000
      Lonely       0.39      0.41      0.40      3000
      Normal       0.85      0.38      0.53      3000
    Positive       0.93      0.92      0.93      8000
    Stressed       0.83      0.85      0.84      3000
    Suicidal       0.60      0.64      0.62      3000

    accuracy                           0.68     26000
   macro avg       0.65      0.62      0.62     26000
weighted avg       0.70      0.68      0.68     26000



In [56]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

classifier_mb2=Pipeline([
    ('vectorizer_bow',CountVectorizer(ngram_range=(1,2))),
    ('Multi NB', MultinomialNB())
])
classifier_mb2.fit(X_train,Y_train)
Y_pred= classifier_mb2.predict(X_test)
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

     Anxious       0.31      0.40      0.35      3000
  Depressive       0.59      0.68      0.63      3000
      Lonely       0.37      0.41      0.39      3000
      Normal       0.93      0.31      0.46      3000
    Positive       0.91      0.91      0.91      8000
    Stressed       0.86      0.88      0.87      3000
    Suicidal       0.60      0.66      0.63      3000

    accuracy                           0.66     26000
   macro avg       0.65      0.61      0.61     26000
weighted avg       0.70      0.66      0.66     26000



In [55]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

classifier_mb3=Pipeline([
    ('vectorizer_bow',CountVectorizer(ngram_range=(1,3))),
    ('Multi NB', MultinomialNB())
])
classifier_mb3.fit(X_train,Y_train)
Y_pred= classifier_mb3.predict(X_test)
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

     Anxious       0.31      0.40      0.35      3000
  Depressive       0.57      0.67      0.61      3000
      Lonely       0.37      0.41      0.39      3000
      Normal       0.95      0.27      0.42      3000
    Positive       0.89      0.90      0.90      8000
    Stressed       0.87      0.87      0.87      3000
    Suicidal       0.58      0.64      0.61      3000

    accuracy                           0.65     26000
   macro avg       0.65      0.60      0.59     26000
weighted avg       0.69      0.65      0.65     26000



In [57]:
manual_test_list=["I want to kill myself",
                  "awesome, everything is okay",
                  "thank you so much",
                  "moving another country",
                  "i hate all of you and going to kill myself",
                  "what is the point of living?",
                  "death is everywhere",
                  "overcome depression",
                  "song is great",
                  "i saw a sad woman"]

sonuc1=classifier_mb2.predict(manual_test_list)
print(sonuc1)


['Suicidal' 'Positive' 'Positive' 'Depressive' 'Suicidal' 'Suicidal'
 'Suicidal' 'Depressive' 'Positive' 'Stressed']


In [58]:
sonuc_list = [preprocess(text) for text in manual_test_list]

sonuc1=classifier_mb3.predict(sonuc_list)
print(sonuc1)


['Suicidal' 'Positive' 'Positive' 'Stressed' 'Suicidal' 'Suicidal'
 'Suicidal' 'Depressive' 'Positive' 'Stressed']


## Export modal


In [60]:
import joblib
classifier_file=open("mental_illness_detector.pkl","wb")
joblib.dump(classifier_mb2,classifier_file)
classifier_file.close()