In [1]:
import pandas as pd
import numpy as np

import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from wordcloud import WordCloud
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [2]:
# Load the labelled training set; the preview below shows the columns ID, text and label.
data = pd.read_csv('Train.csv')

In [3]:
# Preview the first rows of the training data.
data.head()

Unnamed: 0,ID,text,label
0,SUAVK39Z,I feel that it was better I dieAm happy,Depression
1,9JDAGUV3,Why do I get hallucinations?,Drugs
2,419WR1LQ,I am stresseed due to lack of financial suppor...,Depression
3,6UY7DX6Q,Why is life important?,Suicide
4,FYC0FTFB,How could I be helped to go through the depres...,Depression


In [4]:
# Load the unlabelled test set; the preview below shows the columns ID and text.
test = pd.read_csv('Test.csv')

In [5]:
# Preview the first rows of the test data.
test.head()

Unnamed: 0,ID,text
0,02V56KMO,How to overcome bad feelings and emotions
1,03BMGTOK,I feel like giving up in life
2,03LZVFM6,I was so depressed feel like got no strength t...
3,0EPULUM5,I feel so low especially since I had no one to...
4,0GM4C5GD,can i be successful when I am a drug addict?


In [6]:
# Count missing values per column in the test set.
test.isnull().sum()

ID      0
text    0
dtype: int64

In [7]:
def remove_punctuation(text):
  """Keep only the purely alphabetic tokens of ``text``.

  Args:
    text: iterable of string tokens.

  Returns:
    A new list with every token for which ``str.isalpha()`` is true, in the
    original order. Tokens containing digits, punctuation or nothing at all
    are dropped.
  """
  alphabetic_tokens = []
  for token in text:
    if token.isalpha():
      alphabetic_tokens.append(token)
  return alphabetic_tokens

def remove_punctuation_from_word(text):
  """Strip one trailing non-letter character from each token.

  Args:
    text: iterable of string tokens.

  Returns:
    A list of the same length as ``text``. A token whose last character is a
    letter is kept unchanged; otherwise its last character is removed (only
    one character, so ``'ok!!'`` becomes ``'ok!'``). Empty tokens are passed
    through unchanged.
  """
  token = []
  for word in text:
    # An empty token has no last character; keep it instead of raising IndexError.
    if not word or word[-1].isalpha():
      token.append(word)
    else:
      token.append(word[:-1])
  return token

stop_words = stopwords.words('english')
# Frozen snapshot of the list above for constant-time membership tests.
# It is built once here; later mutations of `stop_words` are not reflected.
_STOP_WORD_SET = frozenset(stop_words)

def remove_stopword(text):
  """Drop English stop words from a token list.

  Args:
    text: iterable of string tokens.

  Returns:
    A new list with every token that is not in NLTK's English stop-word list,
    in the original order.
  """
  return [w for w in text if w not in _STOP_WORD_SET]
    
def lemmatizing(text):
  """Lemmatize every token in ``text`` with NLTK's WordNet lemmatizer.

  Args:
    text: iterable of string tokens.

  Returns:
    A list with ``WordNetLemmatizer.lemmatize`` applied to each token, in the
    original order.
  """
  wordnet = WordNetLemmatizer()
  lemmas = []
  for token in text:
    lemmas.append(wordnet.lemmatize(token))
  return lemmas

In [8]:
def preprocessText(df, raw_text):
    """Clean one text column of ``df`` and return one processed string per row.

    The steps, applied in order, are: strip and lower-case, tokenize with
    ``word_tokenize``, drop non-alphabetic tokens, strip one trailing
    non-letter per token, remove English stop words, lemmatize, and re-join
    the tokens with single spaces. Progress messages are printed along the way.

    Args:
        df: DataFrame containing the column to process.
        raw_text: name of the text column in ``df``.

    Returns:
        A list of strings in the row order of ``df``. A row whose tokens are
        all filtered out yields an empty string.
    """
    # Missing values are NaN floats and have no .strip(); treat them as empty
    # documents and coerce any other non-string cell to its string form.
    processed_text = df[raw_text].fillna('').astype(str)

    print('Converting to lower case...')
    processed_text = [text.strip().lower() for text in processed_text]
    print('Done')

    print('Tokenizing...')
    processed_text = [word_tokenize(text) for text in processed_text]
    print('Done')

    print('Removing punctuation...')
    processed_text = [remove_punctuation(text) for text in processed_text]
    processed_text = [remove_punctuation_from_word(text) for text in processed_text]
    print('Done')

    print('Removing Stop words...')
    processed_text = [remove_stopword(text) for text in processed_text]
    print('Done')

    print('Lemmatizing...')
    processed_text = [lemmatizing(text) for text in processed_text]

    # Back to one space-separated string per document for the vectorizer.
    processed_text = [' '.join(text) for text in processed_text]

    print('Text pre-processing Done, ', raw_text, '\n')
    return processed_text

In [9]:
# Add the cleaned version of the training text as a new column.
data['processedText'] = preprocessText(data, 'text')

Converting to lower case...
Done
Tokenizing...
Done
Removing punctuation...
Done
Removing Stop words...
Done
Lemmatizing...
Text pre-processing Done,  text 



In [10]:
# Apply the same cleaning to the test text so both sets share one vocabulary space.
test['processedText'] = preprocessText(test, 'text')

Converting to lower case...
Done
Tokenizing...
Done
Removing punctuation...
Done
Removing Stop words...
Done
Lemmatizing...
Text pre-processing Done,  text 



In [11]:
# Model inputs: cleaned text as features, `label` as the target.
X_train = data['processedText']
y_train = data['label']
X_test = test['processedText']

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

# Fixed seed: LinearSVC's solver shuffles data internally, so without it the
# fitted model (and the calibrated probabilities) can differ between runs.
svm = LinearSVC(random_state=42)


# Bag-of-words counts -> TF-IDF weights -> linear SVM wrapped in a calibrator,
# which is what provides predict_proba (LinearSVC alone has none).
model = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', CalibratedClassifierCV(svm) ),
               ])
model.fit(X_train, y_train)

# NOTE: these are predictions on the same rows the model was fitted on, so the
# report below is an in-sample (optimistic) score, not a generalisation estimate.
y_pred = model.predict(X_train)

print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

     Alcohol       0.99      0.99      0.99       140
  Depression       0.98      1.00      0.99       352
       Drugs       1.00      0.95      0.97        58
     Suicide       0.95      0.91      0.93        66

    accuracy                           0.98       616
   macro avg       0.98      0.96      0.97       616
weighted avg       0.98      0.98      0.98       616





In [13]:
# In-sample log loss. A dedicated name keeps the training probabilities from
# being mistaken for the test probabilities if cells are run out of order.
y_proba_train = model.predict_proba(X_train)
# Pass the class order explicitly so the probability columns are matched to
# the labels the model actually uses rather than to an inferred ordering.
log_loss(y_train, y_proba_train, labels=model.classes_)

0.1976658187814614

In [14]:
# Class probabilities for the test set; one row per test document.
y_proba = model.predict_proba(X_test)

In [23]:
# Wrap the test-set probabilities in a DataFrame (columns are named in a later cell).
prediction = pd.DataFrame(y_proba)

In [24]:
# Name the probability columns from the fitted model's own class order
# instead of a hand-typed list, so a column can never be mislabelled if the
# set or order of classes changes.
prediction.columns = list(model.classes_)

In [25]:
# Preview the per-class probabilities.
prediction.head()

Unnamed: 0,Alcohol,Depression,Drugs,Suicide
0,0.354542,0.510544,0.025,0.121794
1,0.025,0.925018,0.025,0.034107
2,0.025,0.967496,0.025,0.025
3,0.025,0.85561,0.042908,0.090649
4,0.21793,0.030657,0.714158,0.037255


In [26]:
# Attach the test IDs to the probabilities by row position.
# - reset_index: concat(axis=1) aligns on index labels, so both sides are put
#   on a fresh 0..n-1 index to guarantee a row-by-row pairing.
# - drop('ID'): makes the cell safe to re-run; without it a second execution
#   would add a duplicate ID column.
prediction = pd.concat(
    [
        test.ID.reset_index(drop=True),
        prediction.drop(columns='ID', errors='ignore').reset_index(drop=True),
    ],
    axis=1,
)

In [27]:
# Reorder the columns into the layout written to the submission file.
prediction = prediction.loc[
    :,
    ['ID', 'Depression', 'Alcohol', 'Suicide', 'Drugs'],
]

In [28]:
# Final check of the submission layout before writing it out.
prediction.head()

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0.510544,0.354542,0.121794,0.025
1,03BMGTOK,0.925018,0.025,0.034107,0.025
2,03LZVFM6,0.967496,0.025,0.025,0.025
3,0EPULUM5,0.85561,0.025,0.090649,0.042908
4,0GM4C5GD,0.030657,0.21793,0.037255,0.714158


In [29]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
prediction.to_csv('prediction3.csv', index=False)