In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# Both CSVs receive identical column renames, so the mapping is declared once.
# NOTE(review): presumably 'posts' and 'suicidal' are renamed so they cannot clash
# with vocabulary tokens once word features are joined in — confirm.
column_renames = {'posts':'posts_from_user','suicidal':'user_is_suicidal'}
df = pd.read_csv('https://www.dropbox.com/s/inkzg6vb5cnnz5c/modeling.csv?dl=1')
dfsw = pd.read_csv('https://www.dropbox.com/s/ye1qsevhhmat9kx/modeling_with_stopwords.csv?dl=1')
df, dfsw = (frame.rename(columns=column_renames) for frame in (df, dfsw))

In [3]:
# Preview the first five rows (the default for head()) to sanity-check the renamed columns.
df.head()

Unnamed: 0,text,posts_from_user,word_count,day_of_week,SPACE__SP,AUX_VBP,PRON_PRP,ADV_RB,VERB_VB,SCONJ_IN,...,AUX_:,AUX_-LRB-,VERB_NNS,AUX_HYPH,AUX_PRP,ADV_CC,AUX_.,AUX_XX,AUX_WRB,user_is_suicidal
0,ever feel like everyone watch start overthink ...,8,400,2,4.76,2.75,11.36,3.66,6.59,3.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,compare people hate make feel like you re enou...,8,427,4,5.91,1.39,10.43,5.22,6.78,2.96,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,find apologize time even thing be not really f...,8,379,4,5.11,2.46,10.8,3.6,6.06,0.76,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,read understand free soul today ill talk anoth...,8,377,2,5.59,1.12,6.15,3.35,3.35,1.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,be not little bit tired people tell think posi...,8,467,2,4.57,1.47,8.65,5.22,7.01,1.96,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [4]:
# Count missing values per column before any modelling.
df.isna().sum()

text                0
posts_from_user     0
word_count          0
day_of_week         0
SPACE__SP           0
                   ..
ADV_CC              0
AUX_.               0
AUX_XX              0
AUX_WRB             0
user_is_suicidal    0
Length: 95, dtype: int64

In [20]:


_TEXT_COLUMN = 'text'
_TARGET_COLUMN = 'user_is_suicidal'


def _vectorize_text(vectorizer, texts, fit):
    """Return a dense, 0-indexed DataFrame of n-gram features for ``texts``.

    When ``fit`` is true the vectorizer learns its vocabulary from ``texts``;
    otherwise the already-fitted vocabulary is reused.
    """
    matrix = vectorizer.fit_transform(texts) if fit else vectorizer.transform(texts)
    # Densified because the join/scaling steps below operate on DataFrames;
    # memory grows as rows x vocabulary size.
    return pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())


def _append_predictors(word_features, frame):
    """Join every non-text, non-target column of ``frame`` onto ``word_features``."""
    predictors = frame.drop(columns=[_TEXT_COLUMN, _TARGET_COLUMN]).reset_index(drop=True)
    clashes = word_features.columns.intersection(predictors.columns)
    if len(clashes) > 0:
        # DataFrame.join would fail here anyway; name the offending columns instead.
        raise ValueError(
            "vocabulary terms collide with predictor columns: {}".format(list(clashes))
        )
    return word_features.join(predictors)


def modeling_df(df, v_type='tfidf', min_ngram=1, max_ngram=3, scale=True, join=True,
                min_df=50, max_df=.60, test_size=.3, random_state=42, verbose=True):
    """Split ``df`` into train/test feature matrices and label vectors.

    The frame is split with stratification on ``user_is_suicidal``. The text
    vectorizer (and the scaler, when used) are fitted on the training part
    only and then applied to the test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column and a ``user_is_suicidal`` column.
        Columns are selected by name, so their position does not matter.
    v_type : {'tfidf', 'count'}
        Which vectorizer to build the n-gram features with.
    min_ngram, max_ngram : int
        Bounds of the ``ngram_range`` passed to the vectorizer.
    scale : bool
        If true, standardize all features with a ``StandardScaler``.
    join : bool
        If true, append every remaining column of ``df`` (everything except
        the text and the target) to the n-gram features.
    min_df, max_df : int or float
        Document-frequency cut-offs passed to the vectorizer.
    test_size : float
        Share of rows held out for testing.
    random_state : int
        Seed for the train/test split.
    verbose : bool
        If true, print the diagnostic output (label values, word-feature
        shape, 'not join' marker).

    Returns
    -------
    X_train, y_train, X_test, y_test
        Feature DataFrames and label Series. All four share a fresh
        0..n-1 index, so rows of X and y line up by index as well as position.

    Raises
    ------
    ValueError
        If ``v_type`` is not supported, or if ``join`` is true and a
        vocabulary term has the same name as one of the appended columns.
    """
    # Validate before doing any work so a typo fails fast.
    if v_type not in ('tfidf', 'count'):
        raise ValueError("v_type must be 'tfidf' or 'count'")
    vectorizer_cls = TfidfVectorizer if v_type == 'tfidf' else CountVectorizer

    train, test = train_test_split(
        df, stratify=df[_TARGET_COLUMN], test_size=test_size, random_state=random_state
    )
    if verbose:
        print(train[_TARGET_COLUMN].unique())
        print(test[_TARGET_COLUMN].unique())

    vectorizer = vectorizer_cls(min_df=min_df, max_df=max_df, ngram_range=(min_ngram, max_ngram))
    word_features = _vectorize_text(vectorizer, train[_TEXT_COLUMN], fit=True)
    if verbose:
        print(word_features.shape)
    test_word_features = _vectorize_text(vectorizer, test[_TEXT_COLUMN], fit=False)

    if join:
        X_train = _append_predictors(word_features, train)
        X_test = _append_predictors(test_word_features, test)
    else:
        if verbose:
            print('not join')
        X_train, X_test = word_features, test_word_features

    # Drop the shuffled pre-split index so the labels align with the
    # re-indexed feature frames in both the join and non-join paths.
    y_train = train[_TARGET_COLUMN].reset_index(drop=True)
    y_test = test[_TARGET_COLUMN].reset_index(drop=True)
    if verbose:
        print(y_train.unique(), y_test.unique())

    if scale:
        # Fit on the training features only, then apply the same transform to both.
        scaler = StandardScaler().fit(X_train)
        X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    return X_train, y_train, X_test, y_test


# Count Vectorizer With Logistic Regression

In [13]:
# Count n-grams joined with the remaining columns (join defaults to True), unscaled.
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='count',scale = False)
clf = LogisticRegression(random_state=42, solver='saga')
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
# Report the positive-class metrics at full precision.
for template, scorer in (
    ("Precision score: {}", precision_score),
    ("Recall score: {}", recall_score),
    ("F1 score: {}", f1_score),
):
    print(template.format(scorer(y_test, predictions)))

[1 0]
[1 0]
[1 0] [1 0]




              precision    recall  f1-score   support

           0       0.68      0.93      0.79      2749
           1       0.90      0.60      0.72      2996

    accuracy                           0.76      5745
   macro avg       0.79      0.77      0.76      5745
weighted avg       0.80      0.76      0.75      5745

Precision score: 0.9035
Recall score: 0.6031375166889186
F1 score: 0.7233787029623698


0.7226452905811623

In [14]:
# Count n-grams joined with the remaining columns, standardized.
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='count',scale = True)
clf = LogisticRegression(random_state=42, solver='saga')
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
# Report the positive-class metrics at full precision.
for template, scorer in (
    ("Precision score: {}", precision_score),
    ("Recall score: {}", recall_score),
    ("F1 score: {}", f1_score),
):
    print(template.format(scorer(y_test, predictions)))

[1 0]
[1 0]
[1 0] [1 0]




              precision    recall  f1-score   support

           0       0.70      0.65      0.67      2749
           1       0.70      0.74      0.72      2996

    accuracy                           0.70      5745
   macro avg       0.70      0.70      0.70      5745
weighted avg       0.70      0.70      0.70      5745

Precision score: 0.6972735819492322
Recall score: 0.742656875834446
F1 score: 0.7192500404073057


0.7144243208279432

In [15]:
# TF-IDF n-grams joined with the remaining columns, unscaled.
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='tfidf',scale = False)
clf = LogisticRegression(random_state=42, solver='saga')
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
# Report the positive-class metrics at full precision.
for template, scorer in (
    ("Precision score: {}", precision_score),
    ("Recall score: {}", recall_score),
    ("F1 score: {}", f1_score),
):
    print(template.format(scorer(y_test, predictions)))

[1 0]
[1 0]
[1 0] [1 0]




              precision    recall  f1-score   support

           0       0.68      0.93      0.79      2749
           1       0.90      0.60      0.72      2996

    accuracy                           0.76      5745
   macro avg       0.79      0.77      0.76      5745
weighted avg       0.80      0.76      0.75      5745

Precision score: 0.9042606516290727
Recall score: 0.6021361815754339
F1 score: 0.7229012221999599


0.7229012221999599

In [16]:
# Count n-grams only (join=False), unscaled.
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='count',scale = False, join=False)
print(X_train.shape)
clf = LogisticRegression(random_state=42, solver='saga')
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
# Report the positive-class metrics at full precision.
for template, scorer in (
    ("Precision score: {}", precision_score),
    ("Recall score: {}", recall_score),
    ("F1 score: {}", f1_score),
):
    print(template.format(scorer(y_test, predictions)))

[1 0]
[1 0]
not join
[1 0] [1 0]




              precision    recall  f1-score   support

           0       0.76      0.63      0.69      2749
           1       0.71      0.82      0.76      2996

    accuracy                           0.73      5745
   macro avg       0.73      0.72      0.72      5745
weighted avg       0.73      0.73      0.72      5745

Precision score: 0.7052903967797585
Recall score: 0.8187583444592791
F1 score: 0.7578004324992277


0.7347765145615949

In [17]:
# Count n-grams only (join=False), standardized.
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='count',scale = True, join=False)
clf = LogisticRegression(random_state=42, solver='saga')
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
# Report the positive-class metrics at full precision.
for template, scorer in (
    ("Precision score: {}", precision_score),
    ("Recall score: {}", recall_score),
    ("F1 score: {}", f1_score),
):
    print(template.format(scorer(y_test, predictions)))

[1 0]
[1 0]
not join
[1 0] [1 0]




              precision    recall  f1-score   support

           0       0.69      0.62      0.65      2749
           1       0.68      0.75      0.71      2996

    accuracy                           0.69      5745
   macro avg       0.69      0.68      0.68      5745
weighted avg       0.69      0.69      0.68      5745

Precision score: 0.680509245225826
Recall score: 0.7493324432576769
F1 score: 0.7132644956314536


F1 score: 0.7023657289002557


In [21]:
# TF-IDF n-grams only (join=False), unscaled.
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='tfidf',scale = False, join=False)
print(X_train.shape)
clf = LogisticRegression(random_state=42, solver='saga')
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
# Report the positive-class metrics at full precision.
for template, scorer in (
    ("Precision score: {}", precision_score),
    ("Recall score: {}", recall_score),
    ("F1 score: {}", f1_score),
):
    print(template.format(scorer(y_test, predictions)))

[1 0]
[1 0]
not join
[1 0] [1 0]
(13405, 13536)
              precision    recall  f1-score   support

           0       0.73      0.67      0.70      2749
           1       0.72      0.78      0.75      2996

    accuracy                           0.72      5745
   macro avg       0.73      0.72      0.72      5745
weighted avg       0.72      0.72      0.72      5745

Precision score: 0.7173645320197044
Recall score: 0.7777036048064085
F1 score: 0.746316463805253


In [12]:
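# 0.7382680212868892  (NOTE(review): unlabeled score from an earlier run — TODO record which model and metric produced it)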

In [19]:
# Shape of whichever training matrix the most recently executed modelling cell left in X_train.
X_train.shape

(13405, 13550)

In [None]:
# NOTE(review): bare tuple in an unexecuted cell; it matches the shape printed by the
# TF-IDF, join=False run — presumably pasted for reference. Confirm or delete.
(13405, 13536)