In [1]:
import pandas as pd 
from collections import Counter
import string
# Importing nltk resources for text preprocessing
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

# Data Loading

In [2]:
file_path = "data/ai-perception-concat-paragraphs.csv"
data = pd.read_csv(file_path)

# display first 5 rows of data
data.head()

Unnamed: 0,Article ID,Article Date,NYT section,Paragraph,Title,Fiction,Cyborg (positive),Decisions (positive),Education (positive),Entertain (positive),...,Cyborg (negative),Ethics (negative),Military (negative),Progress (negative),Singularity (negative),Work (negative),AI Mood,AI Relevance,Other (negative),Other (positive)
0,4fd100e58eb7c8105d5bbb33,2012-04-01 00:00:00 UTC,Arts; Style; Magazine,9. The robot designs woven into these tea towe...,Stijl Council,0,0,0,0,0,...,0,0,0,0,0,0,3.0,2.333333,,
1,4fd100e88eb7c8105d5bbd2d,,Science; Health,"In 1818, Mary Shelley's ''Frankenstein'' raise...",Statues to Golems to R2-D2,2,0,0,0,0,...,0,1,2,0,1,0,2.333333,5.0,,
2,4fd14a668eb7c8105d627c40,,Business; Opinion,Mr. Culbertson would use ''powerful new tools'...,ECONOMIC POLICY,0,0,2,0,0,...,0,0,0,0,0,0,4.0,4.333333,,manage entire economies AI and computers could...
3,4fd14a678eb7c8105d627d24,1986-02-16 00:00:00 UTC,Business,"Some golfer. Fortunately, you'll never have to...",IN PURSUIT OF THE PERFECT GOLF BALL,0,0,0,0,0,...,0,0,0,0,0,0,3.0,4.0,,
4,4fd14a678eb7c8105d627d35,1986-02-16 00:00:00 UTC,U.S.,All week long Navy divers and salvage experts ...,LATEST PICTURES FROM NASA SHOW WODER FIRE ON B...,0,0,0,0,0,...,0,0,0,0,0,0,3.0,3.333333,,


In [3]:
print("Rows:", data.shape[0])
# Count NaNs in 'Article Date' column
print("Number of NaNs in 'Article Date' column: ", data['Article Date'].isnull().sum(), f"({round(data['Article Date'].isnull().sum()/data.shape[0]*100, 2)}%)")

Rows: 3365
Number of NaNs in 'Article Date' column:  1418 (42.14%)


In [58]:
# change 'Article Date' in datetime64[ns, UTC] format and change to 'Article Year' in int64 format
# Convert the 'Article Date' column to datetime, coercing errors
data['Article Date'] = pd.to_datetime(data['Article Date'], errors='coerce')
data['Article Year'] = data['Article Date'].dt.year
# convert to int64 - ignore nulls
data['Article Year'] = data['Article Year'].fillna(0)
data['Article Year'] = data['Article Year'].astype('int64')

data.head()

Unnamed: 0,Article ID,Article Date,NYT section,Paragraph,Title,Fiction,Cyborg (positive),Decisions (positive),Education (positive),Entertain (positive),...,Military (negative),Progress (negative),Singularity (negative),Work (negative),AI Mood,AI Relevance,Other (negative),Other (positive),Article Year,Processed_Paragraph
0,4fd100e58eb7c8105d5bbb33,2012-04-01 00:00:00+00:00,Arts; Style; Magazine,9. The robot designs woven into these tea towe...,Stijl Council,0,0,0,0,0,...,0,0,0,0,3.0,2.333333,,,2012,"[9, robot, designs, woven, into, these, tea, t..."
1,4fd100e88eb7c8105d5bbd2d,NaT,Science; Health,"In 1818, Mary Shelley's ''Frankenstein'' raise...",Statues to Golems to R2-D2,2,0,0,0,0,...,2,0,1,0,2.333333,5.0,,,0,"[1818, mary, frankenstein, raised, specter, ma..."
2,4fd14a668eb7c8105d627c40,NaT,Business; Opinion,Mr. Culbertson would use ''powerful new tools'...,ECONOMIC POLICY,0,0,2,0,0,...,0,0,0,0,4.0,4.333333,,manage entire economies AI and computers could...,0,"[culbertson, powerful, tools, model, manage, e..."
3,4fd14a678eb7c8105d627d24,1986-02-16 00:00:00+00:00,Business,"Some golfer. Fortunately, you'll never have to...",IN PURSUIT OF THE PERFECT GOLF BALL,0,0,0,0,0,...,0,0,0,0,3.0,4.0,,,1986,"[some, golfer, fortunately, youll, never, meet..."
4,4fd14a678eb7c8105d627d35,1986-02-16 00:00:00+00:00,U.S.,All week long Navy divers and salvage experts ...,LATEST PICTURES FROM NASA SHOW WODER FIRE ON B...,0,0,0,0,0,...,0,0,0,0,3.0,3.333333,,,1986,"[week, long, navy, divers, salvage, experts, u..."


# Text Preprocessing

### Define Stop Words

In [59]:
# Define common English stop words - set class data type
stop_words = {'thus', 'the', 'and', 'to', 'of', 'a', 'in', 'that', 'is', 'was', 'he', 'for', 'it', 'with', 'as', 'his', 'on', 'be', 'at', 'by', 'i', 'this', 'had', 'not', 'but', 'from', 'or', 'have', 'an', 'they', 'which', 'you', 'were', 'her', 'their', 'we', 'its', 'said', 'like', '—', '——', '-', '--'}

# load frequent words dataset - columns are 'word' and 'count'
freq_words = pd.read_csv('data/unigram_freq.csv')

# add 1 and 2 letter words to the stop_words set if they are not already there
for i in range(len(freq_words['word'])):
    word = str(freq_words['word'][i])
    if len(word) <= 2:
        stop_words.add(word)

# add stop words with count more than 550,000,000
for i in range(len(freq_words['word'])):
    word = str(freq_words['word'][i])
    if freq_words['count'][i] > 550000000:
        stop_words.add(word)

# create pandas series with words with count less than 100,000
less_freq_words = freq_words[freq_words['count'] < 100000]['word'].tail(100000).tolist()
# add less_freq_words to stop_words set
stop_words = stop_words.union(set(less_freq_words))

In [60]:
# freq_words[freq_words['count'] > 550000000].shape
freq_words[freq_words['count'] < 100000].shape

(233846, 2)

In [61]:
print(len(stop_words))

100774


### Tokenize Text

In [62]:
# Function to tokenize text (split by spaces and remove punctuation)
def custom_tokenize(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize by spaces
    tokens = text.lower().split()
    return tokens

# Updated function to preprocess text (tokenize and remove stop words) with handling for missing or non-string values
def custom_preprocess_text(text):
    if pd.isnull(text) or not isinstance(text, str):
        return []
    
    # Tokenize the text
    tokens = custom_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

In [63]:
# Apply the custom preprocessing function to the 'Paragraph' column
data['Processed_Paragraph'] = data['Paragraph'].apply(custom_preprocess_text)

# Preview the processed paragraphs of the first few rows
data[['Paragraph', 'Processed_Paragraph']].head()

Unnamed: 0,Paragraph,Processed_Paragraph
0,9. The robot designs woven into these tea towe...,"[9, robot, designs, woven, into, these, tea, t..."
1,"In 1818, Mary Shelley's ''Frankenstein'' raise...","[1818, mary, frankenstein, raised, specter, ma..."
2,Mr. Culbertson would use ''powerful new tools'...,"[culbertson, powerful, tools, model, manage, e..."
3,"Some golfer. Fortunately, you'll never have to...","[some, golfer, fortunately, youll, never, meet..."
4,All week long Navy divers and salvage experts ...,"[week, long, navy, divers, salvage, experts, u..."


## Task 1: Word Associations (Positive and Negative)

To find words with the most positive and negative associations, we analyze the processed paragraphs in conjunction with the positive and negative columns in the dataset. We create functions to aggregate the word counts based on these associations.

In [64]:
# Define columns representing positive and negative associations
positive_columns = ['Cyborg (positive)', 'Decisions (positive)', 'Education (positive)', 'Entertain (positive)', 'Healthcare (positive)', 'Singularity (positive)', 'Transportation (positive)', 'Work (positive)']
negative_columns = ['Controling AI (negative)', 'Cyborg (negative)', 'Ethics (negative)', 'Military (negative)', 'Progress (negative)', 'Singularity (negative)', 'Work (negative)']
# 

# Define remaining columns
all_columns = data.columns.tolist()
remaining_columns = [col for col in all_columns if col not in positive_columns and col not in negative_columns]

# Function to count words with positive and negative associations
def count_associations(row):
    # Extract processed paragraph
    words = row['Processed_Paragraph']
    # Initialize counters for positive and negative associations
    positive_counter = Counter()
    negative_counter = Counter()
    # Iterate through words and update counters based on associations in the row
    for word in words:
        if any(row[col] > 0 for col in positive_columns):
            positive_counter[word] += 1
        if any(row[col] > 0 for col in negative_columns):
            negative_counter[word] += 1
    return positive_counter, negative_counter

In [65]:
# Apply the function to count word associations
positive_word_counts = Counter()
negative_word_counts = Counter()
# Create word count dataframe with 'Article ID', 'Article Date', 'NYT Section', 'Title', 'AI Mood', 'AI Relevance', '
for _, row in data.iterrows():
    positive, negative = count_associations(row)
    positive_word_counts += positive
    negative_word_counts += negative

In [66]:
print('robot (positive):', positive_word_counts['robot'])
print('robot (negative):', negative_word_counts['robot'])

robot (positive): 2084
robot (negative): 745


### Validity Checks

In [67]:
# print length of positive_word_counts and negative_word_counts dictionaries
print('positive word count:', len(positive_word_counts))
print('negative word count:', len(negative_word_counts))

positive word count: 20225
negative word count: 13521


In [68]:
# check if 'robot' is contained in negative_word_counts
print('robot in positive words:', 'robot' in positive_word_counts, '\nrobot in negative words:', 'robot' in negative_word_counts)

print('robot (positive):', positive_word_counts['robot'])
print('robot (negative):', negative_word_counts['robot'])

robot in positive words: True 
robot in negative words: True
robot (positive): 2084
robot (negative): 745


Check top positively and negatively associated words

In [69]:
# Get top 10 positive and negative words
top_positive_words = positive_word_counts.most_common(10)
top_negative_words = negative_word_counts.most_common(10)

# top_positive_words, top_negative_words
print('top positive:', top_positive_words, '\ntop negative:', top_negative_words)

top positive: [('robot', 2084), ('intelligence', 1891), ('artificial', 1834), ('computer', 641), ('human', 590), ('technology', 552), ('robots', 502), ('could', 482), ('than', 460), ('into', 441)] 
top negative: [('intelligence', 1220), ('artificial', 1156), ('robot', 745), ('human', 421), ('computer', 381), ('technology', 346), ('robots', 285), ('people', 262), ('could', 260), ('times', 259)]


### Pivot Sentiment Columns into Rows

In [70]:
# Pivot the sentiment columns into rows creating a new row for each sentiment that has a non-zero entry
data_melt = pd.melt(data, id_vars=remaining_columns, value_vars=positive_columns + negative_columns, var_name='Sentiment_Type', value_name='Sentiment_Value')

# drop zero entries from 'Value' column
data_melt_nonzero = data_melt[data_melt['Sentiment_Value'] != 0]
# Preview pivotted nonzero table
data_melt_nonzero.head()

Unnamed: 0,Article ID,Article Date,NYT section,Paragraph,Title,Fiction,AI Mood,AI Relevance,Other (negative),Other (positive),Article Year,Processed_Paragraph,Sentiment_Type,Sentiment_Value
16,4fd156a18eb7c8105d63aed2,1986-08-10 00:00:00+00:00,U.S.,The robot must be keyed to a single individual...,"TO ASSIST HANDICAPPED, A ROBOT THAT CAN HEAR",0,4.333333,5.0,,could be the tool that a handicapped person ...,1986,"[robot, must, keyed, single, individuals, voic...",Cyborg (positive),2
115,4fd16d848eb7c8105d660007,NaT,U.S.,"The F.B.I. is enthusiastic about Big Floyd, wh...",'BIG FLOYD' JOINS THE FORCE,0,3.666667,5.0,,Criminal Investigation Lesser Crimes diffe...,0,"[fbi, enthusiastic, big, floyd, whose, namesak...",Cyborg (positive),1
125,4fd1707b8eb7c8105d66317b,1986-08-10 00:00:00+00:00,Arts,"''Condor,'' on the other hand, tries to be amu...",WHEN THE SLUSH PILE COMES TO LIFE,1,3.0,3.333333,,,1986,"[condor, hand, tries, amusing, after, hero, se...",Cyborg (positive),1
162,4fd1781e8eb7c8105d66ef93,NaT,Business,"''The Tomorrow Makers,'' by Grant Fjermedal ($...",HOW TO AVOID TUNNEL VISION,0,3.111111,4.666667,AI minds are wildly different from hum...,robotic immortality,0,"[tomorrow, makers, grant, fjermedal, 1895, mac...",Cyborg (positive),2
275,4fd190318eb7c8105d696da7,NaT,Technology; Science; Week in Review,WHEN the computer scientist John McCarthy coin...,IDEAS AND TRENDS: Can Machines Learn to Think?...,0,2.888889,4.555556,AI companies failing,science and technology development suppo...,0,"[computer, scientist, john, mccarthy, coined, ...",Cyborg (positive),1


In [71]:
data_melt_nonzero['Sentiment_Value'].value_counts().tail(10)

Sentiment_Value
12    5
9     4
10    4
13    3
16    2
15    2
19    1
11    1
18    1
14    1
Name: count, dtype: int64

In [72]:
# The article with a sentiment 'Value' of 14 is heavily associated with 'Military (negative)'
data_melt_nonzero[data_melt_nonzero['Sentiment_Value'] == 14]

Unnamed: 0,Article ID,Article Date,NYT section,Paragraph,Title,Fiction,AI Mood,AI Relevance,Other (negative),Other (positive),Article Year,Processed_Paragraph,Sentiment_Type,Sentiment_Value
39653,54626c6e79881072f4f730ac,NaT,Science,Warfare is increasingly guided by software. To...,Fearing Bombs That Can Pick Whom to Kill,0,2.444444,4.833333,,warfare,0,"[warfare, increasingly, guided, software, toda...",Military (negative),14


In [80]:

# Create word count dataframe with 'Article ID', 'Article Date', 'Word', 'Count', 'NYT Section', 'Title', 'AI Mood', 'AI Relevance', 'Sentiment (Positive/Negative)', 'Value'
# word_count_df = pd.DataFrame(columns=['Article ID', 'Article Date', 'Word', 'Count', 'NYT Section', 'Title', 'AI Mood', 'AI Relevance', 'Sentiment (Positive/Negative)', 'Value'])


# expand 'Processed_Paragraph' column list entries into a row for each word
data_word_expand = data_melt_nonzero.explode('Processed_Paragraph')
# reset index
data_word_expand.reset_index(drop=True, inplace=True)
# rename 'Processed_Paragraph' column to 'Word'
data_word_expand.rename(columns={'Processed_Paragraph': 'Word'}, inplace=True)
# drop rows with NaN values
# data_word_expand.dropna(inplace=True)
# add count value from positive_word_counts and negative_word_counts to data_word_expand dataframe for each word and sentiment type
data_word_expand['Count'] = data_word_expand.apply(lambda row: positive_word_counts[row['Word']] if row['Sentiment_Type'] in positive_columns else negative_word_counts[row['Word']], axis=1)
# reset index
data_word_expand.reset_index(drop=True, inplace=True)

In [74]:
data_word_expand.shape

(493350, 15)

In [85]:
# print columns of data_word_expand in single line with '' around col name
print(' '.join([f"'{col}'," for col in data_word_expand.columns.tolist()]))

'Article ID', 'Article Date', 'NYT section', 'Paragraph', 'Title', 'Fiction', 'AI Mood', 'AI Relevance', 'Other (negative)', 'Other (positive)', 'Article Year', 'Word', 'Sentiment_Type', 'Sentiment_Value', 'Count',


In [90]:
# drop unnecessary columns in data_word_expand - only keep 'Article ID', 'Word' 'Sentiment_Type', 'AI Mood', 'Sentiment_Value', 'AI Relevance', 'Count'
data_word_expand_opt = data_word_expand.drop(['Article Date', 'Article Year', 'NYT section', 'Paragraph', 'Title', 'Fiction', 'Other (negative)', 'Other (positive)'], axis=1)

data_word_expand_opt.head()

Unnamed: 0,Article ID,AI Mood,AI Relevance,Word,Sentiment_Type,Sentiment_Value,Count
0,4fd156a18eb7c8105d63aed2,4.333333,5.0,robot,Cyborg (positive),2,2084
1,4fd156a18eb7c8105d63aed2,4.333333,5.0,must,Cyborg (positive),2,65
2,4fd156a18eb7c8105d63aed2,4.333333,5.0,keyed,Cyborg (positive),2,1
3,4fd156a18eb7c8105d63aed2,4.333333,5.0,single,Cyborg (positive),2,25
4,4fd156a18eb7c8105d63aed2,4.333333,5.0,individuals,Cyborg (positive),2,9


In [76]:
# data_word_expand[data_word_expand['Word'] == 'robot'].sort_values(by='Article ID', ascending=False).head(6)

In [91]:
data_word_expand_opt.to_csv('data/data_concat_word_count.csv')

# Output Formatted Data

In [82]:
# output to 'data/ai_concat_pivot.csv'
# data_melt_nonzero_op = data_melt_nonzero.drop(['Processed_Paragraph'], axis=1)
output_path = 'data/ai_concat_pivot.csv'
data_melt_nonzero_op.to_csv(output_path)

In [None]:
# Create word count dataset from 'Processed_Paragraph'


In [None]:
data_concat = pd.read_csv(output_path)
data_concat.head()

Unnamed: 0.1,Unnamed: 0,Article ID,Article Date,NYT section,Paragraph,Title,Fiction,AI Mood,AI Relevance,Other (negative),Other (positive),Article Year,Processed_Paragraph,Sentiment_Type,Value
0,16,4fd156a18eb7c8105d63aed2,1986-08-10 00:00:00+00:00,U.S.,The robot must be keyed to a single individual...,"TO ASSIST HANDICAPPED, A ROBOT THAT CAN HEAR",0,4.333333,5.0,,could be the tool that a handicapped person ...,1986.0,"['robot', 'must', 'keyed', 'single', 'individu...",Cyborg (positive),2
1,115,4fd16d848eb7c8105d660007,,U.S.,"The F.B.I. is enthusiastic about Big Floyd, wh...",'BIG FLOYD' JOINS THE FORCE,0,3.666667,5.0,,Criminal Investigation Lesser Crimes diffe...,,"['fbi', 'enthusiastic', 'big', 'floyd', 'whose...",Cyborg (positive),1
2,125,4fd1707b8eb7c8105d66317b,1986-08-10 00:00:00+00:00,Arts,"''Condor,'' on the other hand, tries to be amu...",WHEN THE SLUSH PILE COMES TO LIFE,1,3.0,3.333333,,,1986.0,"['condor', 'hand', 'tries', 'amusing', 'after'...",Cyborg (positive),1
3,162,4fd1781e8eb7c8105d66ef93,,Business,"''The Tomorrow Makers,'' by Grant Fjermedal ($...",HOW TO AVOID TUNNEL VISION,0,3.111111,4.666667,AI minds are wildly different from hum...,robotic immortality,,"['tomorrow', 'makers', 'grant', 'fjermedal', '...",Cyborg (positive),2
4,275,4fd190318eb7c8105d696da7,,Technology; Science; Week in Review,WHEN the computer scientist John McCarthy coin...,IDEAS AND TRENDS: Can Machines Learn to Think?...,0,2.888889,4.555556,AI companies failing,science and technology development suppo...,,"['computer', 'scientist', 'john', 'mccarthy', ...",Cyborg (positive),1
