In [8]:
# imports
import numpy as np
import pandas as pd 
import pickle
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

In [4]:
df = pd.read_csv('https://www.dropbox.com/s/inkzg6vb5cnnz5c/modeling.csv?dl=1')
dfsw = pd.read_csv('https://www.dropbox.com/s/ye1qsevhhmat9kx/modeling_with_stopwords.csv?dl=1')
df = df.rename(columns = {'posts':'posts_from_user','suicidal':'user_is_suicidal'})
dfsw = dfsw.rename(columns = {'posts':'posts_from_user','suicidal':'user_is_suicidal'})

In [7]:
df.head()

Unnamed: 0,text,posts_from_user,word_count,day_of_week,SPACE__SP,AUX_VBP,PRON_PRP,ADV_RB,VERB_VB,SCONJ_IN,...,AUX_:,AUX_-LRB-,VERB_NNS,AUX_HYPH,AUX_PRP,ADV_CC,AUX_.,AUX_XX,AUX_WRB,user_is_suicidal
0,ever feel like everyone watch start overthink ...,8,400,2,4.76,2.75,11.36,3.66,6.59,3.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,compare people hate make feel like you re enou...,8,427,4,5.91,1.39,10.43,5.22,6.78,2.96,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,find apologize time even thing be not really f...,8,379,4,5.11,2.46,10.8,3.6,6.06,0.76,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,read understand free soul today ill talk anoth...,8,377,2,5.59,1.12,6.15,3.35,3.35,1.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,be not little bit tired people tell think posi...,8,467,2,4.57,1.47,8.65,5.22,7.01,1.96,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [8]:
df.isna().sum()

text                0
posts_from_user     0
word_count          0
day_of_week         0
SPACE__SP           0
                   ..
ADV_CC              0
AUX_.               0
AUX_XX              0
AUX_WRB             0
user_is_suicidal    0
Length: 95, dtype: int64

In [5]:


def modeling_df(df,v_type='tfidf',min_ngram = 1, max_ngram = 3, scale = True, join = True):

    train, test = train_test_split(df,stratify=df['user_is_suicidal'],test_size=.3, random_state=42)
    # if scale:
    #     scaler = StandardScaler()
    #     scaler.fit_transform(train)
    #     scaler.transform(test)
    print(train['user_is_suicidal'].unique())
    print(test['user_is_suicidal'].unique())
    if v_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=50, max_df=.60, ngram_range=(min_ngram,max_ngram))
    elif v_type == 'count':
        vectorizer = CountVectorizer(min_df=50, max_df=.60, ngram_range=(min_ngram,max_ngram))
    else:
        raise Exception("v_type must be 'tfidf' or 'count'")
    text_matrix = vectorizer.fit_transform(train['text'])
    word_features = pd.DataFrame(text_matrix.toarray(),columns = vectorizer.get_feature_names()).reset_index(drop=True)
    print(word_features.shape)
    if join:
        other_features = train.iloc[:, 1:].reset_index(drop=True)
        modeling_features = word_features.join(other_features)
    else:
        modeling_features = word_features
    test_text_matrix = vectorizer.transform(test['text'])
    test_word_features = pd.DataFrame(test_text_matrix.toarray(),columns = vectorizer.get_feature_names()).reset_index(drop=True)
    if join:
        test_other_features = test.iloc[:, 1:].reset_index(drop=True)
        test_modeling_features = test_word_features.join(test_other_features)
    else:
        test_modeling_features = test_word_features
    if join:
        X_train, y_train, X_test, y_test = modeling_features.iloc[:, :-1], modeling_features.iloc[:, -1], test_modeling_features.iloc[:, :-1],test_modeling_features.iloc[:, -1]
    else:
        print('not join')
        X_train, y_train, X_test, y_test = modeling_features, train['user_is_suicidal'],  test_modeling_features, test['user_is_suicidal']
    print(y_train.unique(),y_test.unique())
    if scale:
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = pd.DataFrame(scaler.transform(X_train),columns= X_train.columns)
        X_test = pd.DataFrame(scaler.transform(X_test),columns= X_test.columns)
    return X_train, y_train, X_test, y_test 


# Count Vectorizor With Logistic Regression

In [7]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='count',scale = False)
clf = LogisticRegression(random_state=42, solver='saga').fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
print("Precision score: {}".format(precision_score(y_test, predictions)))
print("Recall score: {}".format(recall_score(y_test, predictions)))
print("F1 score: {}".format(f1_score(y_test, predictions)))

[1 0]
[1 0]
(13405, 13528)
[1 0] [1 0]




              precision    recall  f1-score   support

           0       0.68      0.93      0.79      2749
           1       0.90      0.60      0.72      2996

    accuracy                           0.76      5745
   macro avg       0.79      0.77      0.75      5745
weighted avg       0.80      0.76      0.75      5745

Precision score: 0.9033066132264529
Recall score: 0.6018024032042724
F1 score: 0.7223557692307692


0.7226452905811623

In [8]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='count',scale = True)
clf = LogisticRegression(random_state=42, solver='saga').fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
print("Precision score: {}".format(precision_score(y_test, predictions)))
print("Recall score: {}".format(recall_score(y_test, predictions)))
print("F1 score: {}".format(f1_score(y_test, predictions)))

[1 0]
[1 0]
(13405, 13528)
[1 0] [1 0]
              precision    recall  f1-score   support

           0       0.70      0.65      0.67      2749
           1       0.70      0.74      0.72      2996

    accuracy                           0.70      5745
   macro avg       0.70      0.70      0.70      5745
weighted avg       0.70      0.70      0.70      5745

Precision score: 0.69717868338558
Recall score: 0.7423230974632844
F1 score: 0.7190430003233107




0.7144243208279432

In [9]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='tfidf',scale = False)
clf = LogisticRegression(random_state=42, solver='saga').fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
print("Precision score: {}".format(precision_score(y_test, predictions)))
print("Recall score: {}".format(recall_score(y_test, predictions)))
print("F1 score: {}".format(f1_score(y_test, predictions)))

[1 0]
[1 0]
(13405, 13528)
[1 0] [1 0]




              precision    recall  f1-score   support

           0       0.68      0.93      0.79      2749
           1       0.90      0.60      0.72      2996

    accuracy                           0.76      5745
   macro avg       0.79      0.77      0.76      5745
weighted avg       0.80      0.76      0.75      5745

Precision score: 0.9042606516290727
Recall score: 0.6021361815754339
F1 score: 0.7229012221999599


0.7229012221999599

In [10]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='count',scale = False, join=False)
print(X_train.shape)
clf = LogisticRegression(random_state=42, solver='saga').fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
print("Precision score: {}".format(precision_score(y_test, predictions)))
print("Recall score: {}".format(recall_score(y_test, predictions)))
print("F1 score: {}".format(f1_score(y_test, predictions)))

[1 0]
[1 0]
(13405, 13528)
not join
[1 0] [1 0]
(13405, 13528)




              precision    recall  f1-score   support

           0       0.74      0.63      0.68      2749
           1       0.70      0.80      0.75      2996

    accuracy                           0.72      5745
   macro avg       0.72      0.71      0.71      5745
weighted avg       0.72      0.72      0.72      5745

Precision score: 0.7016720445878557
Recall score: 0.7983978638184246
F1 score: 0.7469164715066354


0.7347765145615949

In [11]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='count',scale = True, join=False)
clf = LogisticRegression(random_state=42, solver='saga').fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
print("Precision score: {}".format(precision_score(y_test, predictions)))
print("Recall score: {}".format(recall_score(y_test, predictions)))
print("F1 score: {}".format(f1_score(y_test, predictions)))

[1 0]
[1 0]
(13405, 13528)
not join
[1 0] [1 0]
              precision    recall  f1-score   support

           0       0.69      0.62      0.65      2749
           1       0.68      0.75      0.71      2996

    accuracy                           0.69      5745
   macro avg       0.69      0.68      0.68      5745
weighted avg       0.69      0.69      0.68      5745

Precision score: 0.68134872417983
Recall score: 0.7486648865153538
F1 score: 0.7134223918575064




F1 score: 0.7023657289002557


In [12]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='tfidf',scale = False, join=False)
print(X_train.shape)
clf = LogisticRegression(random_state=42, solver='saga').fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
print("Precision score: {}".format(precision_score(y_test, predictions)))
print("Recall score: {}".format(recall_score(y_test, predictions)))
print("F1 score: {}".format(f1_score(y_test, predictions)))

[1 0]
[1 0]
(13405, 13528)
not join
[1 0] [1 0]
(13405, 13528)
              precision    recall  f1-score   support

           0       0.73      0.66      0.70      2749
           1       0.72      0.78      0.74      2996

    accuracy                           0.72      5745
   macro avg       0.72      0.72      0.72      5745
weighted avg       0.72      0.72      0.72      5745

Precision score: 0.7154721624115656
Recall score: 0.7763684913217623
F1 score: 0.7446774451736834


In [6]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='tfidf', scale=True)

solver = ['saga']
penalty = ['l2', 'l1', 'elasticnet']
c_values = [0.001, 0.01, 0.1, 1, 10, 100]

lr_params = {'solver': solver, 'penalty': penalty, 'C': c_values}

grid_search_cv_lr = GridSearchCV(estimator = LogisticRegression(random_state=42),
                                param_grid = lr_params, cv=5)

grid_search_cv_lr.fit(X_train, y_train)

grid_search_cv_lr.best_params_

[1 0]
[1 0]
(13405, 13528)
[1 0] [1 0]


Traceback (most recent call last):
  File "/Users/farhad/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/farhad/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1314, in fit
    raise ValueError("l1_ratio must be between 0 and 1;"
ValueError: l1_ratio must be between 0 and 1; got (l1_ratio=None)

Traceback (most recent call last):
  File "/Users/farhad/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/farhad/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1314, in fit
    raise ValueError("l1_ratio must be between 0 and 1;"
ValueError: l1_ratio must be between 0 and 1; got (l1_ratio=None)

Traceback (most recent call last):
  File "/Users/farhad/opt/anaconda3/lib/pytho

{'C': 0.001, 'penalty': 'l1', 'solver': 'saga'}

In [7]:
clf = LogisticRegression(random_state=42, solver='saga', C=0.001, penalty='l1').fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
print("Precision score: {}".format(precision_score(y_test, predictions)))
print("Recall score: {}".format(recall_score(y_test, predictions)))
print("F1 score: {}".format(f1_score(y_test, predictions)))

              precision    recall  f1-score   support

           0       0.75      0.83      0.79      2749
           1       0.83      0.75      0.79      2996

    accuracy                           0.79      5745
   macro avg       0.79      0.79      0.79      5745
weighted avg       0.79      0.79      0.79      5745

Precision score: 0.830991124260355
Recall score: 0.75
F1 score: 0.788421052631579


In [9]:
# pickle the model
pickle.dump(clf, open('clf.pkl', 'wb'))

# Naive Bayes Classifiers

### Count verctorizer

In [16]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='count',scale = False, join=False)
print(X_train.shape)
MNB_clf = MultinomialNB().fit(X_train, y_train)
MNB_predictions = MNB_clf.predict(X_test)
print(classification_report(y_test, MNB_predictions))
print("Precision score: {}".format(precision_score(y_test, MNB_predictions)))
print("Recall score: {}".format(recall_score(y_test, MNB_predictions)))
print("F1 score: {}".format(f1_score(y_test, MNB_predictions)))

[1 0]
[1 0]
(13405, 13528)
not join
[1 0] [1 0]
(13405, 13528)
              precision    recall  f1-score   support

           0       0.71      0.65      0.68      2749
           1       0.70      0.76      0.73      2996

    accuracy                           0.71      5745
   macro avg       0.71      0.70      0.70      5745
weighted avg       0.71      0.71      0.70      5745

Precision score: 0.702048417132216
Recall score: 0.7550066755674232
F1 score: 0.7275651334834352


In [17]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='count', scale=False)
print(X_train.shape)
MNB_clf = MultinomialNB().fit(X_train, y_train)
MNB_predictions = MNB_clf.predict(X_test)
print(classification_report(y_test, MNB_predictions))
print("Precision score: {}".format(precision_score(y_test, MNB_predictions)))
print("Recall score: {}".format(recall_score(y_test, MNB_predictions)))
print("F1 score: {}".format(f1_score(y_test, MNB_predictions)))

[1 0]
[1 0]
(13405, 13528)
[1 0] [1 0]
(13405, 13621)
              precision    recall  f1-score   support

           0       0.65      0.95      0.77      2749
           1       0.92      0.54      0.68      2996

    accuracy                           0.73      5745
   macro avg       0.79      0.74      0.73      5745
weighted avg       0.79      0.73      0.72      5745

Precision score: 0.9244521337946944
Recall score: 0.5350467289719626
F1 score: 0.6778012684989428


### TF-IDF Vectorizer

In [18]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='tfidf', scale=False, join=False)
print(X_train.shape)
MNB_clf = MultinomialNB().fit(X_train, y_train)
MNB_predictions = MNB_clf.predict(X_test)
print(classification_report(y_test, MNB_predictions))
print("Precision score: {}".format(precision_score(y_test, MNB_predictions)))
print("Recall score: {}".format(recall_score(y_test, MNB_predictions)))
print("F1 score: {}".format(f1_score(y_test, MNB_predictions)))

[1 0]
[1 0]
(13405, 13528)
not join
[1 0] [1 0]
(13405, 13528)
              precision    recall  f1-score   support

           0       0.71      0.64      0.67      2749
           1       0.70      0.77      0.73      2996

    accuracy                           0.70      5745
   macro avg       0.71      0.70      0.70      5745
weighted avg       0.71      0.70      0.70      5745

Precision score: 0.6967859308671922
Recall score: 0.767022696929239
F1 score: 0.7302192564346996


In [19]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='tfidf', scale=False)
print(X_train.shape)
MNB_clf = MultinomialNB().fit(X_train, y_train)
MNB_predictions = MNB_clf.predict(X_test)
print(classification_report(y_test, MNB_predictions))
print("Precision score: {}".format(precision_score(y_test, MNB_predictions)))
print("Recall score: {}".format(recall_score(y_test, MNB_predictions)))
print("F1 score: {}".format(f1_score(y_test, MNB_predictions)))

[1 0]
[1 0]
(13405, 13528)
[1 0] [1 0]
(13405, 13621)
              precision    recall  f1-score   support

           0       0.65      0.96      0.77      2749
           1       0.93      0.52      0.67      2996

    accuracy                           0.73      5745
   macro avg       0.79      0.74      0.72      5745
weighted avg       0.79      0.73      0.72      5745

Precision score: 0.9310137972405519
Recall score: 0.5180240320427236
F1 score: 0.6656658803345485


# Random Forest Classifier

In [20]:
#for model tuning
from sklearn.model_selection import GridSearchCV
import numpy as np

## TF-IDF Vectorizer

In [15]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='tfidf', scale=False)

max_depth = [1,2,3,4,5,6,7,8,9,10, None]
n_estimators = [10,20,30,40,50,60,70,80,90,100]

rf_params = {'max_depth':max_depth, 'n_estimators': n_estimators}

grid_search_cv_rf = GridSearchCV(RandomForestClassifier(random_state=42),
                                rf_params, cv=5, scoring='accuracy')

grid_search_cv_rf.fit(X_train, y_train)

grid_search_cv_rf.best_params_

[1 0]
[1 0]
(13405, 13528)
[1 0] [1 0]


{'max_depth': None, 'n_estimators': 50}

In [16]:
rf = RandomForestClassifier(random_state=42, max_depth= None, n_estimators=50).fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
print(classification_report(y_test, rf_predictions))
print("Precision score: {}".format(precision_score(y_test, rf_predictions)))
print("Recall score: {}".format(recall_score(y_test, rf_predictions)))
print("F1 score: {}".format(f1_score(y_test, rf_predictions)))

              precision    recall  f1-score   support

           0       0.79      0.80      0.79      2749
           1       0.81      0.81      0.81      2996

    accuracy                           0.80      5745
   macro avg       0.80      0.80      0.80      5745
weighted avg       0.80      0.80      0.80      5745

Precision score: 0.8126675603217158
Recall score: 0.8094125500667557
F1 score: 0.8110367892976589


In [17]:
# pickle the model
pickle.dump(rf, open('rf.pkl', 'wb'))

In [14]:
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
print(classification_report(y_test, rf_predictions))
print("Precision score: {}".format(precision_score(y_test, rf_predictions)))
print("Recall score: {}".format(recall_score(y_test, rf_predictions)))
print("F1 score: {}".format(f1_score(y_test, rf_predictions)))

              precision    recall  f1-score   support

           0       0.79      0.79      0.79      2749
           1       0.80      0.81      0.81      2996

    accuracy                           0.80      5745
   macro avg       0.80      0.80      0.80      5745
weighted avg       0.80      0.80      0.80      5745

Precision score: 0.80488610102344
Recall score: 0.8137516688918558
F1 score: 0.8092946058091286


In [25]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='tfidf', scale=False)
print(X_train.shape)
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
print(classification_report(y_test, rf_predictions))
print("Precision score: {}".format(precision_score(y_test, rf_predictions)))
print("Recall score: {}".format(recall_score(y_test, rf_predictions)))
print("F1 score: {}".format(f1_score(y_test, rf_predictions)))

[1 0]
[1 0]
(13405, 13528)
[1 0] [1 0]
(13405, 13621)
              precision    recall  f1-score   support

           0       0.79      0.79      0.79      2749
           1       0.80      0.81      0.81      2996

    accuracy                           0.80      5745
   macro avg       0.80      0.80      0.80      5745
weighted avg       0.80      0.80      0.80      5745

Precision score: 0.80488610102344
Recall score: 0.8137516688918558
F1 score: 0.8092946058091286


## Count vectorizer

In [26]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='count', scale=False,)
print(X_train.shape)
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
print(classification_report(y_test, rf_predictions))
print("Precision score: {}".format(precision_score(y_test, rf_predictions)))
print("Recall score: {}".format(recall_score(y_test, rf_predictions)))
print("F1 score: {}".format(f1_score(y_test, rf_predictions)))

[1 0]
[1 0]
(13405, 13528)
[1 0] [1 0]
(13405, 13621)
              precision    recall  f1-score   support

           0       0.78      0.78      0.78      2749
           1       0.80      0.79      0.80      2996

    accuracy                           0.79      5745
   macro avg       0.79      0.79      0.79      5745
weighted avg       0.79      0.79      0.79      5745

Precision score: 0.7992602555480834
Recall score: 0.7933911882510013
F1 score: 0.7963149078726969


In [27]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='tfidf', scale=False, join=False)
print(X_train.shape)
rf = RandomForestClassifier().fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
print(classification_report(y_test, rf_predictions))
print("Precision score: {}".format(precision_score(y_test, rf_predictions)))
print("Recall score: {}".format(recall_score(y_test, rf_predictions)))
print("F1 score: {}".format(f1_score(y_test, rf_predictions)))

[1 0]
[1 0]
(13405, 13528)
not join
[1 0] [1 0]
(13405, 13528)
              precision    recall  f1-score   support

           0       0.69      0.67      0.68      2749
           1       0.70      0.73      0.72      2996

    accuracy                           0.70      5745
   macro avg       0.70      0.70      0.70      5745
weighted avg       0.70      0.70      0.70      5745

Precision score: 0.7044430135222151
Recall score: 0.7303070761014686
F1 score: 0.7171419206817436


In [28]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='count', scale=False, join=False)
print(X_train.shape)
rf = RandomForestClassifier().fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
print(classification_report(y_test, rf_predictions))
print("Precision score: {}".format(precision_score(y_test, rf_predictions)))
print("Recall score: {}".format(recall_score(y_test, rf_predictions)))
print("F1 score: {}".format(f1_score(y_test, rf_predictions)))

[1 0]
[1 0]
(13405, 13528)
not join
[1 0] [1 0]
(13405, 13528)
              precision    recall  f1-score   support

           0       0.67      0.69      0.68      2749
           1       0.71      0.69      0.70      2996

    accuracy                           0.69      5745
   macro avg       0.69      0.69      0.69      5745
weighted avg       0.69      0.69      0.69      5745

Precision score: 0.7090220385674931
Recall score: 0.6872496662216289
F1 score: 0.6979661016949152


## SVM Classifier

### TF-IDF Vectorizer

In [30]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='tfidf', scale=False)

C = [0.001, 0.01]

rf_params = {'C':C}

grid_search_cv_svm = GridSearchCV(SVC(random_state=42),
                                rf_params, cv=10)

grid_search_cv_svm.fit(X_train, y_train)

grid_search_cv_svm.best_params_

[1 0]
[1 0]
(13405, 13528)
[1 0] [1 0]


In [29]:
X_train, y_train, X_test, y_test = modeling_df(df=dfsw, v_type ='tfidf', scale=False, join=False)
print(X_train.shape)
svc_model = SVC(C=0.001, kernel="linear", random_state=42).fit(X_train, y_train)
svc_predictions = svc_model.predict(X_test)
print(classification_report(y_test, svc_predictions))
print("Precision score: {}".format(precision_score(y_test, svc_predictions)))
print("Recall score: {}".format(recall_score(y_test, svc_predictions)))
print("F1 score: {}".format(f1_score(y_test, svc_predictions)))

[1 0]
[1 0]
(13405, 13528)
not join
[1 0] [1 0]
(13405, 13528)
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2749
           1       0.52      1.00      0.69      2996

    accuracy                           0.52      5745
   macro avg       0.26      0.50      0.34      5745
weighted avg       0.27      0.52      0.36      5745

Precision score: 0.5214969538729329
Recall score: 1.0
F1 score: 0.685505090950692


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
