In [64]:
import pandas as pd
# import string

## Ideas

1. Weight by WorkTimeInSeconds feature to more heavily weight the workers who spent more time rating articles/paragraphs.

In [65]:
# Load the dataset
file_path = "data/ai-perception.csv"
data = pd.read_csv(file_path)

In [66]:
data.head()

Unnamed: 0,Article ID,Article Date,Paragraph number,NYT section,Paragraph,Title,WorkTimeInSeconds,AI Mood,AI Relevance,Fiction,...,Other (negative),Cyborg (positive),Decisions (positive),Education (positive),Entertain (positive),Healthcare (positive),Singularity (positive),Transportation (positive),Work (positive),Other (positive)
0,4fd1cbc98eb7c8105d701286,1996-10-06 00:00:00 UTC,18,New York and Region,"Thus, next weekend will feature the robot who ...",LONG ISLAND JOURNAL,1472,4,5,0,...,{},0,0,0,0,0,0,0,0,{}
1,4fd1cbc98eb7c8105d701286,1996-10-06 00:00:00 UTC,18,New York and Region,"Thus, next weekend will feature the robot who ...",LONG ISLAND JOURNAL,49,4,5,0,...,{},0,0,0,0,0,0,0,0,{}
2,4fd1cbc98eb7c8105d701286,1996-10-06 00:00:00 UTC,18,New York and Region,"Thus, next weekend will feature the robot who ...",LONG ISLAND JOURNAL,66,5,5,0,...,{},0,0,0,1,0,0,0,0,{}
3,54b0793b7988100e21965770,2006-07-31 00:00:00 UTC,16,Technology,That phrase was coined in the 1970۪s by Masahi...,Camera System Creates Sophisticated 3-D Effects,3053,3,4,0,...,{},0,0,0,0,0,0,0,0,{}
4,54b0793b7988100e21965770,2006-07-31 00:00:00 UTC,16,Technology,That phrase was coined in the 1970۪s by Masahi...,Camera System Creates Sophisticated 3-D Effects,25,3,4,0,...,{},0,0,0,0,0,0,0,0,{}


In [92]:
data.dtypes

Article ID                   object
Article Date                 object
Paragraph number             object
NYT section                  object
Paragraph                    object
Title                        object
WorkTimeInSeconds             int64
AI Mood                       int64
AI Relevance                  int64
Fiction                       int64
Controling AI (negative)      int64
Cyborg (negative)             int64
Ethics (negative)             int64
Military (negative)           int64
Progress (negative)           int64
Singularity (negative)        int64
Work (negative)               int64
Other (negative)             object
Cyborg (positive)             int64
Decisions (positive)          int64
Education (positive)          int64
Entertain (positive)          int64
Healthcare (positive)         int64
Singularity (positive)        int64
Transportation (positive)     int64
Work (positive)               int64
Other (positive)             object
dtype: object

In [67]:
# set '{}' entries of Other (positive) and Other (negative) to empty string ''
data.loc[data['Other (positive)'] == '{}', 'Other (positive)'] = ''
data.loc[data['Other (negative)'] == '{}', 'Other (negative)'] = ''

In [68]:
data.columns

Index(['Article ID', 'Article Date', 'Paragraph number', 'NYT section',
       'Paragraph', 'Title', 'WorkTimeInSeconds', 'AI Mood', 'AI Relevance',
       'Fiction', 'Controling AI (negative)', 'Cyborg (negative)',
       'Ethics (negative)', 'Military (negative)', 'Progress (negative)',
       'Singularity (negative)', 'Work (negative)', 'Other (negative)',
       'Cyborg (positive)', 'Decisions (positive)', 'Education (positive)',
       'Entertain (positive)', 'Healthcare (positive)',
       'Singularity (positive)', 'Transportation (positive)',
       'Work (positive)', 'Other (positive)'],
      dtype='object')

In [69]:
# Define columns representing positive and negative associations
positive_columns = ['Cyborg (positive)', 'Decisions (positive)', 'Education (positive)', 'Entertain (positive)', 'Healthcare (positive)', 'Singularity (positive)', 'Transportation (positive)', 'Work (positive)']
negative_columns = ['Controling AI (negative)', 'Cyborg (negative)', 'Ethics (negative)', 'Military (negative)', 'Progress (negative)', 'Singularity (negative)', 'Work (negative)']

# Define remaining columns
all_columns = data.columns.tolist()
remaining_columns = [col for col in all_columns if col not in positive_columns and col not in negative_columns]
remaining_columns

['Article ID',
 'Article Date',
 'Paragraph number',
 'NYT section',
 'Paragraph',
 'Title',
 'WorkTimeInSeconds',
 'AI Mood',
 'AI Relevance',
 'Fiction',
 'Other (negative)',
 'Other (positive)']

In [76]:
# 1. aggregate multiple worker entries with same (Article ID, Paragraph number) pair into single entry
# sum sentiment columns that have 0/1 entries such as Fiction, Cyborg (negative), and Work (positive)...
# average numerical cols with 0-5 rated entries such as AI Mood and AI Relevance
# concatenate text columns such as Other (negative) and Other (positive)

aggfuncs = {
    "Article Date": "first", 
    "NYT section": "first",
    "Title": "first",
    "Fiction": "sum",
    "AI Mood": "mean",
    "AI Relevance": "mean",
    "Other (negative)": lambda x: " ".join(x),
    "Other (positive)": lambda x: " ".join(x),
}
# add positive and negative columns to aggfuncs with sum
for col in positive_columns:
    aggfuncs[col] = "sum"
for col in negative_columns:
    aggfuncs[col] = "sum"

# "Article Date", "NYT section", "Title" - "Paragraph"
data_agg = data.groupby(["Article ID", "Paragraph"], as_index=False).agg(aggfuncs)
print(data_agg.shape)
data_agg.head()

(5094, 25)


Unnamed: 0,Article ID,Paragraph,Article Date,NYT section,Title,Fiction,AI Mood,AI Relevance,Other (negative),Other (positive),...,Singularity (positive),Transportation (positive),Work (positive),Controling AI (negative),Cyborg (negative),Ethics (negative),Military (negative),Progress (negative),Singularity (negative),Work (negative)
0,4fd100e58eb7c8105d5bbb33,9. The robot designs woven into these tea towe...,2012-04-01 00:00:00 UTC,Arts; Style; Magazine,Stijl Council,0,3.0,2.333333,,,...,0,0,0,0,0,0,0,0,0,0
1,4fd100e88eb7c8105d5bbd2d,"In 1818, Mary Shelley's ''Frankenstein'' raise...",,Science; Health,Statues to Golems to R2-D2,2,2.333333,5.0,,,...,1,0,0,1,0,1,2,0,1,0
2,4fd14a668eb7c8105d627c40,Mr. Culbertson would use ''powerful new tools'...,,Business; Opinion,ECONOMIC POLICY,0,4.0,4.333333,,manage entire economies AI and computers could...,...,0,0,1,0,0,0,0,0,0,0
3,4fd14a678eb7c8105d627d24,"Some golfer. Fortunately, you'll never have to...",1986-02-16 00:00:00 UTC,Business,IN PURSUIT OF THE PERFECT GOLF BALL,0,3.0,4.0,,,...,0,0,0,0,0,0,0,0,0,0
4,4fd14a678eb7c8105d627d35,All week long Navy divers and salvage experts ...,1986-02-16 00:00:00 UTC,U.S.,LATEST PICTURES FROM NASA SHOW WODER FIRE ON B...,0,3.0,3.333333,,,...,0,1,0,0,0,0,0,0,0,0


In [78]:
# count number of entries for each unique Article ID
data_agg['Article ID'].value_counts()

Article ID
556f9ded7988101ccffbbdc9             24
548cfa86798810543aba4e42             21
560375947988107f8592ba01             20
4fd2571d8eb7c8105d801549             17
56e6b84c79881050bc5d950e             17
                                     ..
4fd250978eb7c8105d7f3f3d              1
4fd250988eb7c8105d7f3fc8              1
4fd251138eb7c8105d7f55ec              1
4fd251808eb7c8105d7f6275              1
Space Shuttle Atlantis Blasts Off     1
Name: count, Length: 3366, dtype: int64

In [63]:
# Check entries where Article ID is 54eae4ca7988102c57355c06
# data_agg[data_agg['Article ID'] == '54eae4ca7988102c57355c06']

In [84]:
# concatenate separate paragraphs of a shared Article ID into one row/string - including other columns
aggfuncs = {
    "Article Date": "first", 
    "NYT section": "first",
    "Title": "first",
    "Paragraph": lambda x: "".join(str(x)),
    "Fiction": "sum",
    "AI Mood": "mean",
    "AI Relevance": "mean",
    "Other (negative)": lambda x: " ".join(x),
    "Other (positive)": lambda x: " ".join(x),
}
# add positive and negative columns to aggfuncs with sum
for col in positive_columns:
    aggfuncs[col] = "sum"
for col in negative_columns:
    aggfuncs[col] = "sum"

# "Article Date", "NYT section", "Title" - "Paragraph"
data_agg_paragraphs = data.groupby(["Article ID"], as_index=False).agg(aggfuncs)

#data_agg_paragraphs = data_agg.groupby(['Article ID'])['Paragraph'].apply(lambda x: ' '.join(x)).reset_index()
print(data_agg_paragraphs.shape)
data_agg_paragraphs.tail()

(3367, 25)


Unnamed: 0,Article ID,Article Date,NYT section,Title,Paragraph,Fiction,AI Mood,AI Relevance,Other (negative),Other (positive),...,Singularity (positive),Transportation (positive),Work (positive),Controling AI (negative),Cyborg (negative),Ethics (negative),Military (negative),Progress (negative),Singularity (negative),Work (negative)
3362,57211b99798810622b5ab8ad,,U.S.,Transcript: Donald Trump’s Foreign Policy Speech,"9144 This includes 3D printing, artificial ...",0,3.0,2.666667,,,...,0,0,0,0,0,0,0,0,0,0
3363,5721c71f798810622b5abbb9,2016-04-28 04:15:59 UTC,Education,News Q&#8217;s | A Robot Monk Captivates China...,780 6. Do you think a robot monk can ever ...,0,3.0,3.625,,religious application,...,0,0,0,0,0,0,0,0,0,0
3364,572350e37988101b346ef18a,2016-04-30 00:00:00 UTC,Your Money,The Pros and Cons of Using a Robot as an Inves...,"5997 Kara M. Stein, a commissioner at the S...",0,2.666667,4.0,,,...,0,0,1,0,0,0,0,0,0,1
3365,Space Shuttle Atlantis Blasts Off,18,2008-02-07T00:00:00Z,,3417 Science\n3418 Science\n3419 Scie...,0,3.0,1.333333,,,...,0,0,0,0,0,0,0,0,0,0
3366,{},{},,"he preferences of 12-year-old boys, ''Ultravio...",2814 NaN\n2815 NaN\n2816 NaN\n5343 ...,3,3.166667,1.333333,,,...,0,0,0,0,0,0,0,0,0,0


In [89]:
data_agg_paragraphs[data_agg_paragraphs['Article ID'] == '4fd100e88eb7c8105d5bbd2d']

Unnamed: 0,Article ID,Article Date,NYT section,Title,Paragraph,Fiction,AI Mood,AI Relevance,Other (negative),Other (positive),...,Singularity (positive),Transportation (positive),Work (positive),Controling AI (negative),Cyborg (negative),Ethics (negative),Military (negative),Progress (negative),Singularity (negative),Work (negative)
1,4fd100e88eb7c8105d5bbd2d,,Science; Health,Statues to Golems to R2-D2,"13650 In 1818, Mary Shelley's ''Frankenstei...",2,2.333333,5.0,,,...,1,0,0,1,0,1,2,0,1,0


In [91]:
data[data['Article ID'] == '4fd100e88eb7c8105d5bbd2d']

Unnamed: 0,Article ID,Article Date,Paragraph number,NYT section,Paragraph,Title,WorkTimeInSeconds,AI Mood,AI Relevance,Fiction,...,Other (negative),Cyborg (positive),Decisions (positive),Education (positive),Entertain (positive),Healthcare (positive),Singularity (positive),Transportation (positive),Work (positive),Other (positive)
13650,4fd100e88eb7c8105d5bbd2d,,4,Science; Health,"In 1818, Mary Shelley's ''Frankenstein'' raise...",Statues to Golems to R2-D2,118,3,5,1,...,,0,0,0,0,0,1,0,0,
13651,4fd100e88eb7c8105d5bbd2d,,4,Science; Health,"In 1818, Mary Shelley's ''Frankenstein'' raise...",Statues to Golems to R2-D2,520,2,5,1,...,,0,0,0,0,0,0,0,0,
13652,4fd100e88eb7c8105d5bbd2d,,4,Science; Health,"In 1818, Mary Shelley's ''Frankenstein'' raise...",Statues to Golems to R2-D2,53,2,5,0,...,,0,0,0,0,0,0,0,0,


In [85]:
data_agg_paragraphs['Article ID'].value_counts()

Article ID
4fd100e58eb7c8105d5bbb33    1
52b856de798810193ff56fa1    1
529230af798810698a3301de    1
529764987988100fb72064b6    1
5298ca3679881047430cda04    1
                           ..
4fd239b98eb7c8105d7cd78c    1
4fd239bf8eb7c8105d7cdb58    1
4fd23a138eb7c8105d7cdf3c    1
4fd23a658eb7c8105d7ceb1b    1
{}                          1
Name: count, Length: 3367, dtype: int64