In [50]:
import pandas as pd

# Load Dataset

In [51]:
# Read the source CSV into the DataFrame `data`
file_path = "data/ai-perception.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Preprocess Data

In [52]:
# Collect the distinct entries of the two free-text "Other" columns for inspection
other_negative_unique_values, other_positive_unique_values = (
    data[column_name].unique()
    for column_name in ('Other (negative)', 'Other (positive)')
)

In [53]:
# Drop rows where 'Paragraph number' is not made up purely of digits.
# Each value is stringified before the check so that non-string entries
# (e.g. NaN, or integers if the column was parsed as numeric) are classified
# instead of raising AttributeError. This also keeps the cell re-runnable
# after the column has been cast to int in a later cell.
# NOTE(review): a float-typed column would stringify as e.g. '3.0' and be
# rejected entirely - confirm the CSV column is read as text or integers.
is_integer_paragraph_number = (
    data['Paragraph number']
    .map(lambda value: str(value).isdigit())
    .astype(bool)  # guarantees a boolean mask even for an empty frame
)

# Distinct offending values, kept for inspection
non_integer_paragraph_numbers = data.loc[
    ~is_integer_paragraph_number, 'Paragraph number'
].unique()

# .copy() yields an independent frame so that the column assignments made in
# later cells do not write into a slice of the original DataFrame
data = data.loc[is_integer_paragraph_number].copy()

# or instead, replace the non-integer values with -1 in the "Paragraph number" column - 12 non-unique entries are affected
# data['Paragraph number'] = data['Paragraph number'].apply(lambda x: int(x) if str(x).isdigit() else -1)

In [54]:
# Swap the '{}' placeholder for an empty string in both free-text "Other" columns at once
data[['Other (negative)', 'Other (positive)']] = (
    data[['Other (negative)', 'Other (positive)']].replace(to_replace='{}', value='')
)

# Cast "Paragraph number" to an integer dtype
data['Paragraph number'] = data['Paragraph number'].astype(dtype=int)

# Aggregate Data

In [55]:
# Columns to sum (0/1 values)
sum_columns = [
    'Fiction', 'Cyborg (positive)', 'Decisions (positive)', 'Education (positive)', 'Entertain (positive)', 'Healthcare (positive)', 'Singularity (positive)',
    'Transportation (positive)', 'Work (positive)', 'Controling AI (negative)', 'Cyborg (negative)', 'Ethics (negative)', 'Military (negative)', 'Progress (negative)', 'Singularity (negative)', 'Work (negative)'
]

# Columns to average (0-5 rated entries)
average_columns = ['AI Mood', 'AI Relevance']

# Columns to concatenate/join (text columns)
concat_columns = ['Other (negative)', 'Other (positive)']

# Columns to keep the first entry
first_columns = ['Article Date', 'NYT section', 'Title']


def _join_annotations(values):
    """Join the non-missing, non-blank entries of `values` with single spaces.

    Blank entries (such as the '' that replaces the '{}' placeholder) are
    skipped; joining them would otherwise yield whitespace-only strings like
    '  ' and stray leading/trailing spaces in the aggregated text.

    Parameters
    ----------
    values : pandas.Series
        The text entries of one group.

    Returns
    -------
    str
        The stripped, non-empty entries separated by one space, or '' when
        there is nothing to join.
    """
    stripped = (str(value).strip() for value in values.dropna())
    return ' '.join(text for text in stripped if text)


# First, we group by ('Article ID', 'Paragraph number') pairs, and keep only unique paragraphs for each pair
# Also apply the other aggregation functions for the relevant columns here
paragraph_grouped_data = data.groupby(['Article ID', 'Paragraph number']).agg({
    'Paragraph': 'first',  # Keep only the first paragraph for each (Article ID, Paragraph number) pair
    **{col: 'sum' for col in sum_columns},
    **{col: 'mean' for col in average_columns},
    **{col: _join_annotations for col in concat_columns},
    **{col: 'first' for col in first_columns}
}).reset_index()

In [56]:
def _join_nonblank(texts):
    """Join the non-missing, non-blank entries of `texts` with single spaces.

    Skipping blank entries avoids whitespace-only results (e.g. '  ') when
    most paragraphs of an article carry no free-text value.
    """
    stripped = (str(text).strip() for text in texts.dropna())
    return ' '.join(piece for piece in stripped if piece)


# Group by 'Article ID' and concatenate paragraphs into a single row.
# Paragraphs are joined in the row order of paragraph_grouped_data.
# Note: the averaged columns here are means of the per-paragraph means, so
# every paragraph of an article carries equal weight.
final_data = paragraph_grouped_data.groupby('Article ID').agg({
    'Article Date': 'first',
    'NYT section': 'first',
    # drop missing paragraphs first: str.join raises TypeError on NaN
    'Paragraph': lambda paragraphs: ' '.join(paragraphs.dropna().astype(str)),
    'Title': 'first',
    **{col: 'sum' for col in sum_columns},
    **{col: 'mean' for col in average_columns},
    **{col: _join_nonblank for col in concat_columns} # or try with 'first'
}).reset_index()

In [57]:
# Preview the top five rows of the article-level table
final_data.head(n=5)

Unnamed: 0,Article ID,Article Date,NYT section,Paragraph,Title,Fiction,Cyborg (positive),Decisions (positive),Education (positive),Entertain (positive),...,Cyborg (negative),Ethics (negative),Military (negative),Progress (negative),Singularity (negative),Work (negative),AI Mood,AI Relevance,Other (negative),Other (positive)
0,4fd100e58eb7c8105d5bbb33,2012-04-01 00:00:00 UTC,Arts; Style; Magazine,9. The robot designs woven into these tea towe...,Stijl Council,0,0,0,0,0,...,0,0,0,0,0,0,3.0,2.333333,,
1,4fd100e88eb7c8105d5bbd2d,,Science; Health,"In 1818, Mary Shelley's ''Frankenstein'' raise...",Statues to Golems to R2-D2,2,0,0,0,0,...,0,1,2,0,1,0,2.333333,5.0,,
2,4fd14a668eb7c8105d627c40,,Business; Opinion,Mr. Culbertson would use ''powerful new tools'...,ECONOMIC POLICY,0,0,2,0,0,...,0,0,0,0,0,0,4.0,4.333333,,manage entire economies AI and computers could...
3,4fd14a678eb7c8105d627d24,1986-02-16 00:00:00 UTC,Business,"Some golfer. Fortunately, you'll never have to...",IN PURSUIT OF THE PERFECT GOLF BALL,0,0,0,0,0,...,0,0,0,0,0,0,3.0,4.0,,
4,4fd14a678eb7c8105d627d35,1986-02-16 00:00:00 UTC,U.S.,All week long Navy divers and salvage experts ...,LATEST PICTURES FROM NASA SHOW WODER FIRE ON B...,0,0,0,0,0,...,0,0,0,0,0,0,3.0,3.333333,,


# Verify Process

In [58]:
# Each 'Article ID' must occupy exactly one row of the aggregated frame.
# The explicit emptiness check keeps the previous behaviour of failing on an
# empty result (is_unique alone is True for an empty column).
assert not final_data.empty, "final_data is empty"
assert final_data['Article ID'].is_unique, "final_data has more than one row for some 'Article ID'"

In [59]:
# Pick the 'Article ID' that appears on the most rows of `data`
# (value_counts orders labels by descending frequency, so the first label is the most frequent one)
article_id_multiple = data['Article ID'].value_counts().index[0]
article_id_multiple

'54eae4ca7988102c57355c06'

In [45]:
# Distinct 'Paragraph' texts stored in the row-level data for the selected article
data.loc[data['Article ID'] == article_id_multiple, 'Paragraph'].unique()

array(['Cognoscenti including Stephen Hawking, Elon Musk and Bill Gates, among others, have recently weighed in on its potential and perils. After reading Nick Bostrom’s book “Superintelligence,” Musk even wondered aloud if A.I. may be “our biggest existential threat.”',
       'Positions on A.I. are split, and not just on its dangers. Some insist that “hard A.I.” (with human-level intelligence) can never exist, while others conclude that it is inevitable. But in many cases these debates may be missing the real point of what it means to live and think with forms of synthetic intelligence very different from our own.',
       'That point, in short, is that a mature A.I. is not necessarily a humanlike intelligence, or one that is at our disposal. If we look for A.I. in the wrong ways, it may emerge in forms that are needlessly difficult to recognize, amplifying its risks and retarding its benefits.',
       'This is not just a concern for the future. A.I. is already out of the lab and de

In [60]:
# Concatenated 'Paragraph' value(s) for the same article in the aggregated table
final_data.loc[final_data['Article ID'] == article_id_multiple, 'Paragraph'].values

array(['Cognoscenti including Stephen Hawking, Elon Musk and Bill Gates, among others, have recently weighed in on its potential and perils. After reading Nick Bostrom’s book “Superintelligence,” Musk even wondered aloud if A.I. may be “our biggest existential threat.” Positions on A.I. are split, and not just on its dangers. Some insist that “hard A.I.” (with human-level intelligence) can never exist, while others conclude that it is inevitable. But in many cases these debates may be missing the real point of what it means to live and think with forms of synthetic intelligence very different from our own. That point, in short, is that a mature A.I. is not necessarily a humanlike intelligence, or one that is at our disposal. If we look for A.I. in the wrong ways, it may emerge in forms that are needlessly difficult to recognize, amplifying its risks and retarding its benefits. This is not just a concern for the future. A.I. is already out of the lab and deep into the fabric of things. 

In [63]:
# Number of rows in the aggregated table
len(final_data)

3365

In [62]:
# Number of distinct 'Article ID' values in the row-level data
len(data['Article ID'].unique())

3365

[x] Passes Validity Checks

In [64]:
# Destination of the article-level table
save_file_path = "data/ai-perception-concat-paragraphs.csv"
# Write final_data out as CSV, leaving the DataFrame index out of the file
final_data.to_csv(path_or_buf=save_file_path, index=False)

In [65]:
# Display the aggregated article-level frame (the rendered output below elides the middle rows and columns)
final_data

Unnamed: 0,Article ID,Article Date,NYT section,Paragraph,Title,Fiction,Cyborg (positive),Decisions (positive),Education (positive),Entertain (positive),...,Cyborg (negative),Ethics (negative),Military (negative),Progress (negative),Singularity (negative),Work (negative),AI Mood,AI Relevance,Other (negative),Other (positive)
0,4fd100e58eb7c8105d5bbb33,2012-04-01 00:00:00 UTC,Arts; Style; Magazine,9. The robot designs woven into these tea towe...,Stijl Council,0,0,0,0,0,...,0,0,0,0,0,0,3.000000,2.333333,,
1,4fd100e88eb7c8105d5bbd2d,,Science; Health,"In 1818, Mary Shelley's ''Frankenstein'' raise...",Statues to Golems to R2-D2,2,0,0,0,0,...,0,1,2,0,1,0,2.333333,5.000000,,
2,4fd14a668eb7c8105d627c40,,Business; Opinion,Mr. Culbertson would use ''powerful new tools'...,ECONOMIC POLICY,0,0,2,0,0,...,0,0,0,0,0,0,4.000000,4.333333,,manage entire economies AI and computers could...
3,4fd14a678eb7c8105d627d24,1986-02-16 00:00:00 UTC,Business,"Some golfer. Fortunately, you'll never have to...",IN PURSUIT OF THE PERFECT GOLF BALL,0,0,0,0,0,...,0,0,0,0,0,0,3.000000,4.000000,,
4,4fd14a678eb7c8105d627d35,1986-02-16 00:00:00 UTC,U.S.,All week long Navy divers and salvage experts ...,LATEST PICTURES FROM NASA SHOW WODER FIRE ON B...,0,0,0,0,0,...,0,0,0,0,0,0,3.000000,3.333333,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3360,571a27d979881062cbad9933,,Technology,Even IBM counted on its software being consume...,The No-Good Week in Tech’s (Hopeful) Trip to a...,0,0,0,0,0,...,0,0,0,0,0,0,3.333333,3.666667,,
3361,5720a4577988102dc25cec48,2016-04-28 00:00:00 UTC,World,"As for Po himself, there is Xian۪er, the two-f...","A Robot Monk Captivates China, Mixing Spiritua...",1,1,1,1,2,...,0,0,0,0,0,0,3.375000,4.208333,inherent limitations of AI,spiritual advice optimism ab...
3362,57211b99798810622b5ab8ad,,U.S.,"This includes 3D printing, artificial intellig...",Transcript: Donald Trump’s Foreign Policy Speech,0,0,0,0,0,...,0,0,0,0,0,0,3.000000,2.666667,,
3363,5721c71f798810622b5abbb9,2016-04-28 04:15:59 UTC,Education,Watch the video above. What questions would yo...,News Q&#8217;s | A Robot Monk Captivates China...,0,1,0,1,0,...,0,0,0,0,0,0,3.000000,3.625000,,religious application
