### Import Libraries
Created by Serhii Zahranychnyi

In [1]:
import pickle
import re
import numpy as np
import pandas as pd
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

stop = stopwords.words('english')
porter = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zagran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data Cleaning and Preprocessing

In [2]:
def clear_message_text(text):
    text = re.sub('<[^>]*>', '', str(text))
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower())
    text = text+' '.join(emoticons).replace('-', '') 
    return text


def tokenize_func(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\(|D|P)',text.lower())
    text = re.sub('[\W]+', ' ', text.lower())
    text += ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in tokenizer_porter(text) if w not in stop]
    return tokenized

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [3]:
df = pd.read_csv('suicidal_data_from_twitter.csv')
df['text'] = df['text'].apply(clear_message_text)

### Using the Hashing Vectorizer

In [4]:
from sklearn.feature_extraction.text import HashingVectorizer
vect = HashingVectorizer(decode_error='ignore', n_features=2**21, 
                         preprocessor=None,tokenizer=tokenize_func)

### Building the Model

In [5]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='log', random_state=1)

In [6]:
X = df["text"].to_list()
y = df['label']

In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [8]:
X_train = vect.transform(X_train)
X_test = vect.transform(X_test)

In [10]:
classes = np.array([0, 1])
clf.partial_fit(X_train, Y_train,classes=classes)

SGDClassifier(loss='log', random_state=1)

In [11]:
print('Accuracy: %.3f' % clf.score(X_test, Y_test))

Accuracy: 0.912


In [12]:
clf = clf.partial_fit(X_test, Y_test)

### Testing and making Predictions

In [13]:
label = {0:'negative', 1:'positive'}
example = ["I'll kill myself am tired of living depressed and alone"]
X = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%'
      %(label[clf.predict(X)[0]],np.max(clf.predict_proba(X))*100))

Prediction: positive
Probability: 93.75%


In [14]:
label = {0:'negative', 1:'positive'}
example = ["It's such a hot day, I'd like to have ice cream and visit the park"]
X = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%'
      %(label[clf.predict(X)[0]],np.max(clf.predict_proba(X))*100))

Prediction: negative
Probability: 97.91%


In [15]:
label = {0:'negative', 1:'positive'}
example = ["I'll never kill myself! I can live alone I think. That's no problem"]
X = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%'
      %(label[clf.predict(X)[0]],np.max(clf.predict_proba(X))*100))

Prediction: positive
Probability: 71.94%


# Save the model for later use

In [16]:
# save the model to disk
filename = 'finalized_model.sav'
pickle.dump(clf, open(filename, 'wb'))

# Load model and use

In [17]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)



0.9353070175438597
