# Text analysis

This notebook contains code for the  analysis of text data for the project: Public attitudes towards social media field experiments. Note that all the results were not included in the final manuscript.

## Set working directory

In [1]:
import pathlib   # Change cwd
import os 

path = pathlib.Path.cwd().parent
os.chdir(path)

## Imports

In [21]:
import yaml   # 3rd party packages
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
from matplotlib import pyplot as plt
from pandas_profiling import ProfileReport

from digex_src import config    # Local imports
from digex_src import preprocess
from digex_src.load_data import get_data_filepath

warnings.filterwarnings('ignore')    # Ignore warnings

## Plotting presets

In [22]:
# set default style
digex_style = config.MPL_STYLE_FILEPATH
digex_palette = config.PALETTE

# set size for inline polots
mpl.rcParams['figure.dpi'] = 200

plt.style.use(digex_style)
sns.color_palette(digex_palette)

## Load processed data

### 1. With config file

In [7]:
processed_data_path = get_data_filepath(
    file=config.PROCESSED_DATA_FILEPATH, 
    data_path=config.PROCESSED_DATA_DIR,
    main=False
) 

digex_df = pd.read_csv(processed_data_path, index_col=0)

digex_df.head()

Unnamed: 0,duration_sec,finished,sm_use,age,gender_id,ethnic_id,edu,politic_views,aware_sm_res,aware_sm_advan,...,rank_pub_interst,rank_add_fac_1,rank_add_fac_1_pos,rank_add_fac_2,rank_add_fac_2_pos,rank_add_fac_3,rank_add_fac_3_pos,aware_sm_advan_score,aware_sm_interact_score,aware_sm_use_score
1,912.0,True,Facebook,29.0,Male,Asian - Eastern,Highschool,Slightly liberal,Extremely aware,['… are large and can contain millions of data...,...,1.0,,,,,,,4,0,9
2,720.0,True,Twitter,33.0,Male,Mixed race,Highschool,Neutral/ Neither conservative or liberal,Moderately aware,['… are large and can contain millions of data...,...,4.0,,,,,,,1,1,9
3,1874.0,True,Facebook,33.0,Female,Pacific Islander,Bachelor's degree,Very liberal,Extremely aware,['… are large and can contain millions of data...,...,1.0,,,,,,,2,2,5
4,1264.0,True,Facebook,73.0,Female,White / Caucasian,Highschool,Slightly conservative,Moderately aware,['… are large and can contain millions of data...,...,1.0,,8.0,,,,,1,1,6
5,556.0,True,Twitter,27.0,Female,Native-American,Highschool,Very liberal,Extremely aware,['… often capture social relationships not fou...,...,7.0,,,,,,,0,3,9


### 2. With filepath

In [14]:
digex_df = pd.read_csv('/Users/jasonburton/Documents/GitHub/article-digex-survey/data/processed/digex-survey-responses-processed.csv', index_col=0)

digex_df.head()

Unnamed: 0,duration_sec,finished,sm_use,age,gender_id,ethnic_id,edu,politic_views,aware_sm_res,aware_sm_advan,...,rank_pub_interst,rank_add_fac_1,rank_add_fac_1_pos,rank_add_fac_2,rank_add_fac_2_pos,rank_add_fac_3,rank_add_fac_3_pos,aware_sm_advan_score,aware_sm_interact_score,aware_sm_use_score
1,912.0,True,Facebook,29.0,Male,Asian - Eastern,Highschool,Slightly liberal,Extremely aware,['… are large and can contain millions of data...,...,1.0,,,,,,,4,0,9
2,720.0,True,Twitter,33.0,Male,Mixed race,Highschool,Neutral/ Neither conservative or liberal,Moderately aware,['… are large and can contain millions of data...,...,4.0,,,,,,,1,1,9
3,1874.0,True,Facebook,33.0,Female,Pacific Islander,Bachelor's degree,Very liberal,Extremely aware,['… are large and can contain millions of data...,...,1.0,,,,,,,2,2,5
4,1264.0,True,Facebook,73.0,Female,White / Caucasian,Highschool,Slightly conservative,Moderately aware,['… are large and can contain millions of data...,...,1.0,,8.0,,,,,1,1,6
5,556.0,True,Twitter,27.0,Female,Native-American,Highschool,Very liberal,Extremely aware,['… often capture social relationships not fou...,...,7.0,,,,,,,0,3,9


## Exploratory data analysis

Resources:
- see `docs/resources/`
- Text as Data, Chris Bail

**Variables to examine: 15, 17, 18, 20, 21, 23, 24, 26, 27, 37, 45-50**

#### Variable 14: understanding of ethical approval

In [8]:
digex_df['ethic_appr']

1      The scope of the project and actions there in ...
2      I think Ethical Approval means that the experi...
3      Researchers focus on ethical standards towards...
4      I would think that using "ethical approval" me...
5       A set of rules of what to do and what to not do.
                             ...                        
495    Approval to do any type of thing that might be...
496    It has to with researchers taking a mental not...
497    I think ethical approval means that institutio...
498    I think ethical approval means that the experi...
499    I think ethical approval is that an academic e...
Name: ethic_appr, Length: 499, dtype: object

In [16]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter

# Make all the text lowercase
digex_df['ethic_appr'] = digex_df['ethic_appr'].str.lower()

# Remove punctuation
digex_df['ethic_appr'] = digex_df['ethic_appr'].apply(lambda x: ''.join([char for char in x if char not in string.punctuation]))

# Remove stop words
stop_words = set(stopwords.words('english'))
digex_df['ethic_appr'] = digex_df['ethic_appr'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Lemmatize the text
lemmatizer = WordNetLemmatizer()
digex_df['ethic_appr'] = digex_df['ethic_appr'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Get the top 20 most frequent words
word_counts = Counter(' '.join(digex_df['ethic_appr']).split()).most_common(40)

# Print the top 20 most frequent words
print(word_counts)

[('ethical', 249), ('approval', 201), ('mean', 194), ('research', 154), ('study', 122), ('data', 109), ('institution', 98), ('would', 94), ('experiment', 91), ('researcher', 86), ('participant', 84), ('think', 78), ('harm', 73), ('standard', 65), ('sure', 56), ('people', 53), ('need', 49), ('way', 47), ('right', 46), ('information', 45), ('method', 44), ('make', 42), ('board', 41), ('used', 37), ('review', 35), ('ensure', 35), ('cause', 31), ('ethic', 31), ('must', 29), ('get', 28), ('use', 27), ('user', 26), ('getting', 26), ('morally', 25), ('collected', 25), ('without', 24), ('moral', 24), ('set', 23), ('believe', 22), ('violate', 21)]


In [17]:
# Get the top 20 most frequent bigrams
bigrams = nltk.ngrams(nltk.word_tokenize(' '.join(digex_df['ethic_appr'])), 2)
bigram_counts = Counter(bigrams).most_common(20)

# Print the top 20 most frequent bigrams
for bigram, count in bigram_counts:
    print(bigram, count)

('ethical', 'approval') 135
('approval', 'mean') 78
('make', 'sure') 36
('think', 'ethical') 34
('ethical', 'standard') 26
('think', 'mean') 22
('mean', 'researcher') 18
('cause', 'harm') 17
('harm', 'participant') 17
('mean', 'institution') 16
('research', 'study') 16
('data', 'collected') 15
('review', 'board') 13
('approval', 'would') 13
('making', 'sure') 12
('mean', 'research') 12
('getting', 'approval') 11
('mean', 'experiment') 10
('would', 'think') 10
('social', 'medium') 10


In [18]:
# Get the top 20 most frequent trigrams
trigrams = nltk.ngrams(nltk.word_tokenize(' '.join(digex_df['ethic_appr'])), 3)
trigram_counts = Counter(trigrams).most_common(20)

# Print the top 20 most frequent trigrams
for trigram, count in trigram_counts:
    print(trigram, count)

('ethical', 'approval', 'mean') 76
('think', 'ethical', 'approval') 33
('ethical', 'approval', 'would') 12
('institutional', 'review', 'board') 9
('approval', 'mean', 'researcher') 9
('accepted', 'ethical', 'standard') 7
('approval', 'mean', 'getting') 7
('approval', 'mean', 'experiment') 6
('know', 'access', 'data') 6
('participant', 'ethical', 'approval') 6
('ethical', 'standard', 'set') 6
('make', 'sure', 'experiment') 6
('right', 'know', 'access') 5
('approval', 'mean', 'institution') 5
('ethical', 'standard', 'genuine') 5
('standard', 'genuine', 'research') 5
('genuine', 'research', 'study') 5
('dignity', 'right', 'safety') 5
('mean', 'getting', 'approval') 5
('researcher', 'participant', 'research') 5


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the text data and transform it to a sparse matrix
tfidf = vectorizer.fit_transform(digex_df['ethic_appr'])

# Get the names of the features (i.e., the words)
features = vectorizer.get_feature_names_out()

# Create a dictionary mapping the feature names to their indices
feature_index = {feature: index for index, feature in enumerate(features)}

# Sort the feature names by their TF-IDF values
sorted_features = sorted(features, key=lambda feature: tfidf[:, feature_index[feature]].sum(), reverse=True)

# Print the top 20 features (i.e., words) with the highest TF-IDF values
print(sorted_features[:40])

['ethical', 'approval', 'mean', 'research', 'study', 'data', 'institution', 'experiment', 'participant', 'harm', 'would', 'think', 'researcher', 'standard', 'sure', 'right', 'way', 'people', 'method', 'need', 'make', 'cause', 'used', 'morally', 'information', 'board', 'ensure', 'getting', 'ethic', 'moral', 'get', 'permission', 'use', 'review', 'must', 'user', 'done', 'something', 'violate', 'believe']


In [26]:
import nltk
from nltk.corpus import stopwords
from gensim import corpora, models

# Remove stop words
stop_words = stopwords.words('english')
digex_df['ethic_appr'] = digex_df['ethic_appr'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Tokenize the text
texts = [nltk.word_tokenize(text) for text in digex_df['ethic_appr']]

# Create a dictionary mapping words to their frequency
dictionary = corpora.Dictionary(texts)

# Filter out the most frequent words
dictionary.filter_extremes(no_below=10, no_above=0.2)

# Create a corpus (a list of bags of words)
corpus = [dictionary.doc2bow(text) for text in texts]

# Create an LDA model
model = models.LdaModel(corpus, num_topics=3, id2word=dictionary)

# Print the 3 topics
for topic in model.show_topics(formatted=True, num_topics=3):
    print(topic)



(0, '0.079*"study" + 0.057*"institution" + 0.056*"would" + 0.048*"standard" + 0.040*"need" + 0.033*"think" + 0.025*"experiment" + 0.023*"participant" + 0.021*"moral" + 0.020*"right"')
(1, '0.051*"harm" + 0.051*"sure" + 0.043*"data" + 0.040*"experiment" + 0.038*"make" + 0.036*"information" + 0.035*"people" + 0.032*"participant" + 0.028*"researcher" + 0.026*"standard"')
(2, '0.063*"data" + 0.049*"researcher" + 0.038*"experiment" + 0.038*"participant" + 0.036*"study" + 0.036*"institution" + 0.036*"method" + 0.031*"would" + 0.030*"think" + 0.025*"review"')
