In [4]:
import spacy
import pandas as pd

In [2]:
# default model for English
nlp = spacy.load('en_core_web_sm')

In [81]:
# read in data
df1 = pd.read_csv("data/survey-data-1.csv")
df2 = pd.read_csv("data/survey-data-2.csv")

df = pd.concat([df1, df2], axis=0)

df.head()

Unnamed: 0,Respondent,Why didn't get booster,why_not_booster_1,why_not_booster_2,why_not_booster_3,Why public health scientists reliable or unreliable,why_scientists_reliable_unreliable_1,why_scientists_reliable_unreliable_2,why_scientists_reliable_unreliable_3,Why CDC changed recommended isolation period,why_isolation_change_1,why_isolation_change_2,why_isolation_change_3
0,1721090000.0,__NA__,,,,i have no idea,99,,,because they are unsure,98.0,,
1,1721137000.0,__NA__,,,,What they say confirms the data I see.,5,,,Evidence,1.0,,
2,1721195000.0,__NA__,,,,Basically every public health scientist in the...,98,,,Scientific and economic reasons,1.0,5.0,
3,,__NA__,,,,They can only do the best they can with limite...,4,,,It was the right thing to do.,1.0,,
4,1721137000.0,__NA__,,,,Public health scientists are working daily to ...,4,,,I think it should be a 7-day minimum (days 0-6...,98.0,,


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 901 entries, 0 to 400
Data columns (total 13 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   Respondent                                           899 non-null    float64
 1   Why didn't get booster                               901 non-null    object 
 2   why_not_booster_1                                    207 non-null    float64
 3   why_not_booster_2                                    45 non-null     float64
 4   why_not_booster_3                                    10 non-null     float64
 5   Why public health scientists reliable or unreliable  887 non-null    object 
 6   why_scientists_reliable_unreliable_1                 901 non-null    int64  
 7   why_scientists_reliable_unreliable_2                 180 non-null    float64
 8   why_scientists_reliable_unreliable_3                 31 non-null     f

In [9]:
df.columns

Index(['Respondent', 'Why didn't get booster', 'why_not_booster_1',
       'why_not_booster_2', 'why_not_booster_3',
       'Why public health scientists reliable or unreliable',
       'why_scientists_reliable_unreliable_1',
       'why_scientists_reliable_unreliable_2',
       'why_scientists_reliable_unreliable_3',
       'Why CDC changed recommended isolation period',
       'why_isolation_change_1', 'why_isolation_change_2',
       'why_isolation_change_3'],
      dtype='object')

In [83]:
def is_token_allowed(token):
    if (not token or not token.text.strip() or
        token.is_stop or token.is_punct):
            return False
    return True

# tokenization, lemmatization and stop-word + punctuation removal
def preprocess_text(x):
    if type(x) is str:
        tokenized_text = nlp(x)
        return " ".join([token.lemma_.strip().lower() for token in nlp(x) if is_token_allowed(token)])


In [84]:
df['Why public health scientists reliable or unreliable'] = df['Why public health scientists reliable or unreliable'].apply(preprocess_text)
df['Why CDC changed recommended isolation period'] = df['Why CDC changed recommended isolation period'].apply(preprocess_text)

In [85]:
df.head()

Unnamed: 0,Respondent,Why didn't get booster,why_not_booster_1,why_not_booster_2,why_not_booster_3,Why public health scientists reliable or unreliable,why_scientists_reliable_unreliable_1,why_scientists_reliable_unreliable_2,why_scientists_reliable_unreliable_3,Why CDC changed recommended isolation period,why_isolation_change_1,why_isolation_change_2,why_isolation_change_3
0,1721090000.0,__NA__,,,,idea,99,,,unsure,98.0,,
1,1721137000.0,__NA__,,,,confirm datum,5,,,evidence,1.0,,
2,1721195000.0,__NA__,,,,basically public health scientist world focus ...,98,,,scientific economic reason,1.0,5.0,
3,,__NA__,,,,good limited information,4,,,right thing,1.0,,
4,1721137000.0,__NA__,,,,public health scientist work daily tidal wave ...,4,,,think 7 day minimum day 0 6 negative rats day ...,98.0,,


In [92]:
# calculate TF-IDF score for "why public health scientiests reliable or unreliable"
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


text = df["Why public health scientists reliable or unreliable"].dropna().to_list()

#using the count vectorizer
count = CountVectorizer()
word_count=count.fit_transform(text)

In [94]:
print("n_samples: %d, n_words: %d" % word_count.shape)

n_samples: 887, n_words: 1285


In [95]:
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count)
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=count.get_feature_names(),columns=["idf_weights"])



In [97]:
#inverse document frequency
df_idf.sort_values(by=['idf_weights'])[-10:]

Unnamed: 0,idf_weights
inaccurate,7.095825
incentive,7.095825
indoor,7.095825
inevitable,7.095825
inexpensive,7.095825
infallible,7.095825
infect,7.095825
infection,7.095825
killing,7.095825
zoom,7.095825


In [99]:
#tfidf
tf_idf_vector=tfidf_transformer.transform(word_count)
feature_names = count.get_feature_names()
first_document_vector=tf_idf_vector[1]
df_tfifd= pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])

In [100]:
df_tfifd.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
confirm,0.867154
datum,0.498039
000,0.000000
physical,0.000000
plan,0.000000
...,...
felch,0.000000
feel,0.000000
federal,0.000000
fed,0.000000
