In [1]:
# Importing essential libraries
import numpy as np
import pickle
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Loading the dataset
# Read the combined tweets CSV (path is relative to the notebook's working
# directory) into a DataFrame and print its (rows, columns) shape as a
# quick sanity check.
df = pd.read_csv('./data/tweets_combined.csv')
print(df.shape)

(1686, 4)


In [3]:
# Preview the first rows; the recorded output shows the raw text in 'tweet'
# and a 0/1 label in 'target' (the last column).
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweet,target
0,2432,2432,Avoid facial cleansers containing harsh chemic...,0
1,1069,1069,146 ???????????????????????????????????????...,0
2,1357,1357,You let me drown in my own tears which held ev...,0
3,1727,1727,Don't ever frown upon someone chasing a dream....,0
4,1950,1950,bro why is everyone so and blacked out? i was...,0


In [4]:
from nltk.stem import WordNetLemmatizer

In [5]:
# One shared WordNet lemmatizer instance; the text-cleaning loop calls
# lemmatizer.lemmatize() on every retained token.
lemmatizer = WordNetLemmatizer()

In [6]:
# Cleaning the tweets
# Produces `corpus`: one cleaned string per row of df['tweet'], in row order.
# A tweet with no alphabetic content (or only stopwords) becomes ''.

# PorterStemmer instance; not used by this cell (tokens are lemmatized, not
# stemmed) but kept in case other cells reference `ps`.
ps = PorterStemmer()

# Build the stopword set once. Re-creating it for every token (as a
# per-word `set(stopwords.words('english'))` does) reloads the word list
# thousands of times for no benefit.
stop_words = set(stopwords.words('english'))

corpus = []
for tweet in df['tweet']:
    # Replace every non-letter character with a space, then lower-case
    letters_only = re.sub(pattern='[^a-zA-Z]', repl=' ', string=tweet)
    tokens = letters_only.lower().split()

    # Drop stopwords and lemmatize the remaining tokens
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Re-join into a single space-separated string for the vectorizer
    corpus.append(' '.join(lemmas))

corpus

['avoid facial cleanser containing harsh chemical irritate weaken protective barrier skin use natural cleanser',
 '',
 'let drown tear held everything hated regretted loved could',
 'ever frown upon someone chasing dream remember chasing one humility',
 'bro everyone blacked gone hour hell broke loose',
 'little miss debbie downer',
 'drink numb pain away easier way way control',
 'timhortons please please please make jalape jack sandwich regular looove',
 'snorkel time fury key west',
 'mrosenbaum happy birthday awesome filled day awesomeness',
 'living downtown first time tiff happening schedule today excitement thechildrenact stanleytucci emmathompson',
 'wen u post one time pic twitter com l r v',
 'feel ever painful stigma prevents honesty problem let keep difficult conversation going x pic twitter com dwxfpvi ix',
 'gloomy day outside like',
 'perseveres good luck mercury award darling anohnitweets pic twitter com imvghq',
 'woke neighbor singing perfect ed sheeran mood set',
 'd

In [9]:
# Feature extraction (TF-IDF) and Multinomial Naive Bayes training
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Turn the cleaned corpus into a TF-IDF matrix, one row per tweet.
cv = TfidfVectorizer()
tfidf_matrix = cv.fit_transform(corpus)
X = tfidf_matrix.toarray()  # dense ndarray instead of the sparse matrix
y = df.iloc[:, -1]          # last column ('target' in the preview above)

# Persisting the fitted vectorizer is currently disabled:
# pickle.dump(cv, open('./data/cv-transform.pkl', 'wb'))

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# split, so its vocabulary/IDF weights also see the test tweets — confirm
# whether that is acceptable for the reported scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.20
)

# Fit Multinomial Naive Bayes (smoothing alpha=0.2) on the training split.
classifier = MultinomialNB(alpha=0.2).fit(X_train, y_train)

# Target path for the pickled model; the dump itself is currently disabled.
filename = './data/depression2.pkl'
# pickle.dump(classifier, open(filename, 'wb'))


In [10]:
# Mean accuracy of the fitted Naive Bayes classifier on the held-out test
# split (X_test / y_test from train_test_split).
score = classifier.score(X_test, y_test)
score

0.7633136094674556

In [11]:
# Predicted labels for the test split; compared against y_test in the
# classification report.
y_pred = classifier.predict(X_test)

In [12]:
# Per-class precision, recall, F1 and support for the test-split predictions.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.66      0.73       166
           1       0.73      0.86      0.79       172

    accuracy                           0.76       338
   macro avg       0.77      0.76      0.76       338
weighted avg       0.77      0.76      0.76       338



In [None]:
# Number of rows per label value, to check how balanced the classes are.
df['target'].value_counts()

In [None]:
# Reload a previously pickled classifier and vectorizer from disk.
# SECURITY: pickle.load can execute arbitrary code; only load files that
# were produced by this project and come from a trusted location.
# NOTE(review): these paths are relative to the working directory, while the
# (commented-out) dump calls in the training cell target './data/...' —
# confirm which location actually holds the artifacts.
filename = 'depression2.pkl'
# Context managers close the file handles even if unpickling fails.
with open(filename, 'rb') as model_file:
    classifier1 = pickle.load(model_file)
with open('cv-transform.pkl', 'rb') as vectorizer_file:
    cv1 = pickle.load(vectorizer_file)

In [None]:
# Accuracy of the reloaded classifier on the current test split.
# NOTE(review): presumes classifier1 was trained on features with the same
# column layout as X_test (i.e. the same fitted vectorizer) — confirm.
score2 = classifier1.score(X_test, y_test)
score2

In [13]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import GridSearchCV

In [14]:
# Hyper-parameter grids for each candidate classifier.
svm_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'linear'],
}
naive_bayes_grid = {'var_smoothing': np.logspace(0, -9, num=100)}
logistic_grid = {'C': [1, 5, 10]}

# Maps a short label to an unfitted estimator ('model') and the grid to
# search for it ('params'). The labels are reused verbatim as row names in
# the results table, so their spelling is left unchanged.
model_params = {
    'svm': {'model': SVC(), 'params': svm_grid},
    'navie_bayes': {'model': GaussianNB(), 'params': naive_bayes_grid},
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params': logistic_grid,
    },
}

In [15]:
import pandas as pd

# Run a 5-fold cross-validated grid search for every entry in model_params.
#   scores          -> list of {'model', 'best_score', 'best_params'} records
#   best_estimators -> label -> estimator refitted with the best parameters
scores = []
best_estimators = {}
for algo, mp in model_params.items():
    # The estimator is searched directly on the TF-IDF features. A
    # StandardScaler pipeline used to be built here but was never handed to
    # GridSearchCV, so it had no effect and has been removed; the parameter
    # grids are keyed for the bare estimator, not for a pipeline step.
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': algo,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    best_estimators[algo] = clf.best_estimator_

In [16]:
# Tabulate the grid-search results, one row per model.
# NOTE(review): this rebinds `df`, which until now held the tweets dataset;
# re-running an earlier cell that reads df['tweet'] or df['target'] after
# this one will fail. Consider a dedicated name for the results table.
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.754443,"{'C': 1, 'gamma': 1, 'kernel': 'linear'}"
1,navie_bayes,0.634988,{'var_smoothing': 0.01873817422860384}
2,logistic_regression,0.749233,{'C': 5}
