# Model data - Logistic Regression

This code imports the clean data csv file and does the following:
1. Balances the positive and negative data by sampling
2. Divides the data into training and testing data
3. Uses count vectorizer and Logistic Regression model with RandomizedSearchCV to tune for the best hyperparameter 'C'
4. Joins the feature coefficients with their names

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction import text
# Set up training and test data
from sklearn.model_selection import train_test_split

df = pd.read_csv('data/df_article_text.csv', sep=',')

df_neg = df[df['label'] == 0]
df_pos = df[df['label'] == 1]

# df_neg has less number of cases. Selecting same number of rows from df_pos
df_pos_sample = df_pos.sample(n = len(df_neg), random_state = 0)

df_balanced = pd.concat([df_pos_sample, df_neg], ignore_index=True)

X = df_balanced['article_text']
y = df_balanced['label']

In [2]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

stop_words = text.ENGLISH_STOP_WORDS.union(['people', 'said', 'time', 'coronavirus', 'COVID', 'Audio'])

pipeline = Pipeline([('count_vectorizer', CountVectorizer(min_df = 0.1,
                                                         ngram_range= (1,3),
                                                         stop_words=stop_words)),
                     ('log_reg', LogisticRegression())])

param = {'log_reg__C': [0.001, 0.01, 0.1, 1, 10]}

randomized_pipe = RandomizedSearchCV(estimator = pipeline, 
                                         param_distributions=param, 
                                         cv=3, n_iter=30, n_jobs=-1)

log_reg_model = randomized_pipe.fit(X_train, y_train)

print('The model score with test data: ', log_reg_model.score(X_test, y_test))

# collect the individual steps
log_reg_step = randomized_pipe.best_estimator_.named_steps['log_reg']
count_vectorizer_step = randomized_pipe.best_estimator_.named_steps['count_vectorizer']

  'stop_words.' % sorted(inconsistent))


The model score with test data:  0.8527131782945736


In [18]:
# data frame for the feature coefficients
df_coef = pd.DataFrame(log_reg_step.coef_)

# data frame of feature names
df_features = pd.DataFrame(count_vectorizer_step.get_feature_names())
df_features = df_features.transpose()

df_coef_feature_names = pd.concat((df_features, df_coef), ignore_index=True)
df_coef_feature_names = df_coef_feature_names.transpose()
df_coef_feature_names = df_coef_feature_names.sort_values(by=[1], ascending=False)

In [19]:
df_coef_feature_names.head(n=10)

Unnamed: 0,0,1
910,wuhan,0.264054
157,com,0.239918
148,chinese,0.23973
133,ccp,0.235165
179,contact,0.2103
394,https,0.193961
505,masks,0.183439
528,millions,0.182007
439,july,0.174998
725,second,0.166001


In [20]:
df_coef_feature_names.tail(n=10)

Unnamed: 0,0,1
812,tested,-0.15848
691,response,-0.158831
717,says,-0.160923
767,spread,-0.162062
475,live,-0.179037
117,business,-0.18307
285,european,-0.187124
284,europe,-0.191816
204,crisis,-0.27295
184,coronavirus,-0.48046
