In [1]:
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('dataset.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [66]:
data['status'].value_counts()

status
Normal                  16343
Depression              15404
Suicidal                10652
Anxiety                  3841
Bipolar                  2777
Stress                   2587
Personality disorder     1077
Name: count, dtype: int64

In [4]:
data.drop(columns='Unnamed: 0', inplace=True)

In [5]:
data.shape

(53043, 2)

In [6]:
data.isnull().sum()

statement    362
status         0
dtype: int64

In [7]:
data.dropna(inplace=True)

In [8]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
data['statement'][0].lower()

'oh my gosh'

In [10]:
statement = []
for value in data['statement']:
    value = value.lower()
    value = ''.join([char for char in value if char not in string.punctuation])
    statement.append(value)
    

In [11]:
data['Statement']=statement

In [12]:
data.head()

Unnamed: 0,statement,status,Statement
0,oh my gosh,Anxiety,oh my gosh
1,"trouble sleeping, confused mind, restless hear...",Anxiety,trouble sleeping confused mind restless heart ...
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,all wrong back off dear forward doubt stay in ...
3,I've shifted my focus to something else but I'...,Anxiety,ive shifted my focus to something else but im ...
4,"I'm restless and restless, it's been a month n...",Anxiety,im restless and restless its been a month now ...


In [13]:
data.drop(columns='statement', inplace=True)

In [14]:
data['status'].value_counts()

status
Normal                  16343
Depression              15404
Suicidal                10652
Anxiety                  3841
Bipolar                  2777
Stress                   2587
Personality disorder     1077
Name: count, dtype: int64

In [15]:
data.head()

Unnamed: 0,status,Statement
0,Anxiety,oh my gosh
1,Anxiety,trouble sleeping confused mind restless heart ...
2,Anxiety,all wrong back off dear forward doubt stay in ...
3,Anxiety,ive shifted my focus to something else but im ...
4,Anxiety,im restless and restless its been a month now ...


In [16]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['status'])

In [17]:
y.shape

(52681,)

In [18]:
vectorizer = TfidfVectorizer(stop_words='english')
x = vectorizer.fit_transform(data['Statement'])

In [19]:
x.shape

(52681, 78316)

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [21]:
x_train.shape

(42144, 78316)

In [22]:
x_test.shape

(10537, 78316)

In [23]:
y_train.shape

(42144,)

In [24]:
y_test.shape

(10537,)

In [25]:
smote = SMOTE(random_state=42)
x_train_resample, y_train_resample = smote.fit_resample(x_train, y_train)

In [26]:
x_train_resample.shape

(91245, 78316)

In [27]:
y_train_resample.shape

(91245,)

In [28]:
model = SVC(kernel='linear', decision_function_shape='ovr')
model.fit(x_train_resample, y_train_resample)


In [29]:
y_predict = model.predict(x_test)

In [30]:
report = classification_report(y_predict, y_test)
print(report)

              precision    recall  f1-score   support

           0       0.84      0.74      0.78       853
           1       0.80      0.81      0.80       516
           2       0.65      0.74      0.70      2651
           3       0.91      0.89      0.90      3356
           4       0.62      0.76      0.69       194
           5       0.66      0.59      0.62       596
           6       0.72      0.66      0.69      2371

    accuracy                           0.77     10537
   macro avg       0.74      0.74      0.74     10537
weighted avg       0.77      0.77      0.77     10537



In [31]:
accuracy = accuracy_score(y_predict, y_test)

In [64]:
accuracy

0.7665369649805448