In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

### Data Loading

In [2]:
# Load the dataset
file_path = "data/ai-perception.csv"
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()

Unnamed: 0,Article ID,Article Date,Paragraph number,NYT section,Paragraph,Title,WorkTimeInSeconds,AI Mood,AI Relevance,Fiction,...,Other (negative),Cyborg (positive),Decisions (positive),Education (positive),Entertain (positive),Healthcare (positive),Singularity (positive),Transportation (positive),Work (positive),Other (positive)
0,4fd1cbc98eb7c8105d701286,1996-10-06 00:00:00 UTC,18,New York and Region,"Thus, next weekend will feature the robot who ...",LONG ISLAND JOURNAL,1472,4,5,0,...,{},0,0,0,0,0,0,0,0,{}
1,4fd1cbc98eb7c8105d701286,1996-10-06 00:00:00 UTC,18,New York and Region,"Thus, next weekend will feature the robot who ...",LONG ISLAND JOURNAL,49,4,5,0,...,{},0,0,0,0,0,0,0,0,{}
2,4fd1cbc98eb7c8105d701286,1996-10-06 00:00:00 UTC,18,New York and Region,"Thus, next weekend will feature the robot who ...",LONG ISLAND JOURNAL,66,5,5,0,...,{},0,0,0,1,0,0,0,0,{}
3,54b0793b7988100e21965770,2006-07-31 00:00:00 UTC,16,Technology,That phrase was coined in the 1970۪s by Masahi...,Camera System Creates Sophisticated 3-D Effects,3053,3,4,0,...,{},0,0,0,0,0,0,0,0,{}
4,54b0793b7988100e21965770,2006-07-31 00:00:00 UTC,16,Technology,That phrase was coined in the 1970۪s by Masahi...,Camera System Creates Sophisticated 3-D Effects,25,3,4,0,...,{},0,0,0,0,0,0,0,0,{}


### Text Preprocessing
This step will include tokenizing the paragraphs, removing stop words, and applying stemming to the words. We will then categorize the words into positive and negative associations based on the specified columns.

In [3]:
# Importing nltk resources for text preprocessing
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/wpm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/wpm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Define common English stop words - set class data type
stop_words = {'thus', 'the', 'and', 'to', 'of', 'a', 'in', 'that', 'is', 'was', 'he', 'for', 'it', 'with', 'as', 'his', 'on', 'be', 'at', 'by', 'i', 'this', 'had', 'not', 'but', 'from', 'or', 'have', 'an', 'they', 'which', 'you', 'were', 'her', 'their', 'we'}
print("Initial stop words:", len(stop_words))

Initial stop words: 36


In [5]:
# load frequent words dataset - columns are 'word' and 'count'
freq_words = pd.read_csv('data/unigram_freq.csv')

# add 1 and 2 letter words to the stop_words set if they are not already there
for i in range(len(freq_words['word'])):
    word = str(freq_words['word'][i])
    if len(word) <= 2:
        stop_words.add(word)

print("Stop words:", len(stop_words))

Stop words: 722


In [6]:
# add stop words with count more than 550000000
for i in range(len(freq_words['word'])):
    word = str(freq_words['word'][i])
    if freq_words['count'][i] > 550000000:
        stop_words.add(word)

print("Stop words:", len(stop_words))

Stop words: 767


In [7]:
# create pandas series with words with count less than 100,000
less_freq_words = freq_words[freq_words['count'] < 100000]['word'].tail(100000).tolist()
# add less_freq_words to stop_words set
stop_words = stop_words.union(set(less_freq_words))
print("Stop words:", len(stop_words))

Stop words: 100767


In [8]:
# # Function to tokenize text (split by spaces and remove punctuation)
def custom_tokenize(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize by spaces
    tokens = text.lower().split()
    return tokens

# Function to preprocess text (tokenize and remove stop words)
# def custom_preprocess_text(text):
#     # Tokenize the text
#     tokens = custom_tokenize(text)
#     # Remove stop words
#     tokens = [word for word in tokens if word not in stop_words]
#     return tokens

In [9]:
# Updated function to preprocess text (tokenize and remove stop words) with handling for missing or non-string values
def custom_preprocess_text(text):
    if pd.isnull(text) or not isinstance(text, str):
        return []
    
    # Tokenize the text
    tokens = custom_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

In [10]:
# Convert the 'Article Date' column to datetime, coercing errors
data['Article Date'] = pd.to_datetime(data['Article Date'], errors='coerce')

# Apply the custom preprocessing function to the 'Paragraph' column
data['Processed_Paragraph'] = data['Paragraph'].apply(custom_preprocess_text)

# Show the processed paragraphs in the first few rows
data[['Paragraph', 'Processed_Paragraph']].head()

Unnamed: 0,Paragraph,Processed_Paragraph
0,"Thus, next weekend will feature the robot who ...","[next, weekend, feature, robot, named, sico, p..."
1,"Thus, next weekend will feature the robot who ...","[next, weekend, feature, robot, named, sico, p..."
2,"Thus, next weekend will feature the robot who ...","[next, weekend, feature, robot, named, sico, p..."
3,That phrase was coined in the 1970۪s by Masahi...,"[phrase, coined, 1970۪s, masahiro, mori, japan..."
4,That phrase was coined in the 1970۪s by Masahi...,"[phrase, coined, 1970۪s, masahiro, mori, japan..."


## Task 1: Word Associations (Positive and Negative)

To find words with the most positive and negative associations, we analyze the processed paragraphs in conjunction with the positive and negative columns in the dataset. We create functions to aggregate the word counts based on these associations.

In [11]:
# Function to count words with positive and negative associations
def count_associations(row):
    # Extract processed paragraph
    words = row['Processed_Paragraph']
    # Initialize counters for positive and negative associations
    positive_counter = Counter()
    negative_counter = Counter()
    # Iterate through words and update counters based on associations in the row
    for word in words:
        if any(row[col] > 0 for col in positive_columns):
            positive_counter[word] += 1
        if any(row[col] > 0 for col in negative_columns):
            negative_counter[word] += 1
    return positive_counter, negative_counter

In [12]:
# Define columns representing positive and negative associations
positive_columns = ['Cyborg (positive)', 'Decisions (positive)', 'Education (positive)', 'Entertain (positive)', 'Healthcare (positive)', 'Singularity (positive)', 'Transportation (positive)', 'Work (positive)']
negative_columns = ['Controling AI (negative)', 'Cyborg (negative)', 'Ethics (negative)', 'Military (negative)', 'Progress (negative)', 'Singularity (negative)', 'Work (negative)']

# Define remaining columns
all_columns = data.columns.tolist()
remaining_columns = [col for col in all_columns if col not in positive_columns and col not in negative_columns]

In [13]:
# Apply the function to count word associations
positive_word_counts = Counter()
negative_word_counts = Counter()
for _, row in data.iterrows():
    positive, negative = count_associations(row)
    positive_word_counts += positive
    negative_word_counts += negative

In [14]:
# print length of positive_word_counts and negative_word_counts dictionaries
print('positive word count:', len(positive_word_counts))
print('negative word count:', len(negative_word_counts))
# check if 'robot' is contained in negative_word_counts
print('robot in positive words:', 'robot' in positive_word_counts, '\nrobot in negative words:', 'robot' in negative_word_counts)

positive word count: 17720
negative word count: 10571
robot in positive words: True 
robot in negative words: True


In [15]:
print('robot (positive):', positive_word_counts['robot'])
print('robot (negative):', negative_word_counts['robot'])

robot (positive): 2169
robot (negative): 465


In [16]:
# Get top 10 positive and negative words
top_positive_words = positive_word_counts.most_common(10)
top_negative_words = negative_word_counts.most_common(10)

# top_positive_words, top_negative_words
print('positive:', top_positive_words, '\nnegative:', top_negative_words)

positive: [('robot', 2169), ('intelligence', 1915), ('artificial', 1876), ('its', 953), ('like', 781), ('said', 756), ('computer', 656), ('human', 603), ('could', 556), ('robots', 540)] 
negative: [('intelligence', 1110), ('artificial', 1047), ('human', 470), ('robot', 465), ('computer', 348), ('—', 345), ('its', 318), ('said', 284), ('like', 272), ('robots', 241)]


In [None]:
"""
We want to output the positive and negative words in a new dataset in CSV form with the below structure. There should be an entry for each of the instances of a word in a paragraph from the original articles.
Article ID	Word	Total Frequency	Sentiment (Positive/Negative)
1234	Robot	200	Positive
2345	Robot	345	Negative
3456	Robot	345	Negative
"""
# Create a new dataframe to hold the results
results = pd.DataFrame(columns=['Article ID', 'Word', 'Total Frequency', 'Sentiment (Positive/Negative)'])


In [18]:
# Pivot the sentiment columns into rows creating a new row for each sentiment that has a 1 entry
data_long = pd.melt(data, id_vars=remaining_columns, value_vars=positive_columns + negative_columns, var_name='Sentiment', value_name='Value')
# drop rows where the sentiment is 0
data_long = data_long[data_long['Value'] != 0]
data_long.head()

Unnamed: 0,Article ID,Article Date,Paragraph number,NYT section,Paragraph,Title,WorkTimeInSeconds,AI Mood,AI Relevance,Fiction,Other (negative),Other (positive),Processed_Paragraph,Sentiment,Value
27,4fd2a5648eb7c8105d88ca51,2004-11-28 00:00:00+00:00,29,Technology; Science; Magazine,"Some of those functions, especially involving ...",A Robot For the Masses,618,5,5,0,{},{},"[some, those, functions, especially, involving...",Cyborg (positive),1
76,4fd1a8b88eb7c8105d6c4d24,1991-11-03 00:00:00+00:00,0,Movies; Arts,Is it progress when a Terminator wants to weep...,Screen Robots Tell a Tale of Mankind,39,3,4,1,{},{},"[progress, terminator, wants, weep, terminator...",Cyborg (positive),1
117,4fd1b1f58eb7c8105d6d47db,1992-04-25 00:00:00+00:00,5,Technology; Business,The new robot gripper solves the problem with ...,Patents; A Simple Method for Robot's Grip,665,5,5,0,{},{},"[robot, gripper, solves, problem, simple, slid...",Cyborg (positive),1
183,4fd3a2858eb7c8105d8ea970,2011-12-25 17:30:32+00:00,4,Opinion,"This is why, in my view, we need to think long...",The Future of Moral Machines,643,5,3,0,{},{},"[why, need, think, long, hard, machine, morali...",Cyborg (positive),1
188,4fd3958e8eb7c8105d8cb8ae,2009-06-17 11:00:42+00:00,0,Health,The genesis of much of the ab work we do these...,Is Your Ab Workout Hurting Your Back?,80,5,5,1,{},{},"[genesis, much, work, these, days, probably, l...",Cyborg (positive),1


In [None]:
melted_data = pd.melt(data, id_vars=remaining_columns, value_vars=positive_columns + negative_columns, var_name='Sentiment', value_name='Value')
# Drop rows where the value is 0
melted_data = melted_data[melted_data['Value'] > 0]
# Drop the value column
melted_data.drop(columns=['Value'], inplace=True)
# Sort by article ID and sentiment
melted_data.sort_values(by=['Article ID', 'Sentiment'], inplace=True)
# Reset the index
melted_data.reset_index(drop=True, inplace=True)

In [75]:
import plotly.express as px
import plotly.graph_objects as go
from dash import dcc, html, Input, Output
import dash

In [76]:
# Create DataFrame for positive words
positive_df = pd.DataFrame(top_positive_words, columns=['Word', 'Count'])
# Create DataFrame for negative words
negative_df = pd.DataFrame(top_negative_words, columns=['Word', 'Count'])

# Generate word clouds
positive_word_cloud = px.treemap(positive_df, path=['Word'], values='Count', title="Top Positive Words")
negative_word_cloud = px.treemap(negative_df, path=['Word'], values='Count', title="Top Negative Words")

In [77]:
# Create bar charts
positive_bar_chart = go.Figure(data=[go.Bar(x=positive_df['Word'], y=positive_df['Count'])])
negative_bar_chart = go.Figure(data=[go.Bar(x=negative_df['Word'], y=negative_df['Count'])])


In [78]:
# Filter data for the 'Health' section
health_data = data[data['NYT section'] == 'Health']
# Count words for the 'Health' section
health_word_counts = Counter()
for words in health_data['Processed_Paragraph']:
    health_word_counts += Counter(words)
# Create a bar chart
health_words_df = pd.DataFrame(health_word_counts.most_common(10), columns=['Word', 'Count'])
health_bar_chart = go.Figure(data=[go.Bar(x=health_words_df['Word'], y=health_words_df['Count'])])


In [1]:
# Initialize the Dash app
app = dash.Dash(__name__)

# Dropdown for word selection
word_dropdown = dcc.Dropdown(
    id='word-dropdown',
    options=[{'label': word, 'value': word} for word in positive_word_counts.keys()],
    value='robot'  # Default value
)

# Function to update word trend chart
@app.callback(
    Output('word-trend-chart', 'figure'),
    Input('word-dropdown', 'value')
)
def update_word_trend(selected_word):
    # Convert the 'Article Date' column to datetime, coercing errors
    # data['Article_Date_DT'] = pd.to_datetime(data['Article Date'], errors='coerce')
    
    # Filter data by the selected word
    word_trend_data = data[data['Processed_Paragraph'].apply(lambda x: selected_word in x)]

    # Extract years and count occurrences
    word_trend_data['Year'] = pd.to_datetime(word_trend_data['Article Date']).dt.year
    word_counts_by_year = word_trend_data.groupby('Year').size().reset_index(name='Count')

    # Create a line chart
    figure = px.line(word_counts_by_year, x='Year', y='Count', title=f"Trend of the Word '{selected_word}' over Time")
    
    return figure

NameError: name 'dash' is not defined

In [80]:
# Run the dahsboard execution at the end [for debugging purposes]

# app.layout = html.Div([
#     html.H1("AI Perception Analysis Dashboard"),
#     dcc.Graph(figure=positive_word_cloud),
#     dcc.Graph(figure=negative_word_cloud),
#     dcc.Graph(figure=positive_bar_chart),
#     dcc.Graph(figure=negative_bar_chart),
#     dcc.Graph(figure=health_bar_chart),
#     html.Div([
#         html.H3("Select a Word for Trend Analysis:"),
#         word_dropdown,
#         dcc.Graph(id='word-trend-chart'),
#         # Add other components as needed
#     ])
# ])

# if __name__ == '__main__':
#     app.run_server(debug=True)

## Analyzing Positive or Negative Connotations of a Selected Word

We create two bar charts to represent the positive and negative associations of a selected word over time.

In [81]:
# # Function to update positive and negative connotations charts
# @app.callback(
#     [Output('positive-connotation-chart', 'figure'),
#      Output('negative-connotation-chart', 'figure')],
#     Input('word-dropdown', 'value')
# )
# def update_connotation_charts(selected_word):
#     # Filter data by the selected word
#     connotation_data = data[data['Processed_Paragraph'].apply(lambda x: selected_word in x)]

#     # Extract years
#     connotation_data['Year'] = pd.to_datetime(connotation_data['Article Date']).dt.year

#     # Calculate positive and negative scores by year
#     positive_scores_by_year = connotation_data.groupby('Year')[positive_columns].sum().sum(axis=1).reset_index(name='Positive Score')
#     negative_scores_by_year = connotation_data.groupby('Year')[negative_columns].sum().sum(axis=1).reset_index(name='Negative Score')

#     # Create bar charts
#     positive_figure = px.bar(positive_scores_by_year, x='Year', y='Positive Score', title=f"Positive Connotations of the Word '{selected_word}' over Time")
#     negative_figure = px.bar(negative_scores_by_year, x='Year', y='Negative Score', title=f"Negative Connotations of the Word '{selected_word}' over Time")
    
#     return positive_figure, negative_figure


In [82]:
# app.layout = html.Div([
#     # ... existing components
#     html.Div([
#         html.H3("Select a Word for Trend and Connotation Analysis:"),
#         word_dropdown,
#         dcc.Graph(id='word-trend-chart'),
#         dcc.Graph(id='positive-connotation-chart'),
#         dcc.Graph(id='negative-connotation-chart'),
#         # Add other components as needed
#     ])
# ])


In [83]:
# # Function to update word frequency chart for a specific year
# @app.callback(
#     Output('word-frequency-chart', 'figure'),
#     [Input('word-dropdown', 'value'),
#      Input('year-input', 'value')]  # Assuming a numeric input component for the year
# )
# def update_word_frequency_chart(selected_word, selected_year):
#     # Filter data by the selected word and year
#     frequency_data = data[data['Processed_Paragraph'].apply(lambda x: selected_word in x)]
#     frequency_data['Year'] = pd.to_datetime(frequency_data['Article Date']).dt.year
#     frequency_data = frequency_data[frequency_data['Year'] == selected_year]

#     # Count occurrences
#     word_count = len(frequency_data)

#     # Create a bar chart
#     figure = go.Figure(data=[go.Bar(x=[selected_word], y=[word_count])])
#     figure.update_layout(title=f"Frequency of the Word '{selected_word}' in {selected_year}")
    
#     return figure


### Analyzing Positive or Negative Connotations of a Selected Word

We'll create a pie chart that shows whether the selected word is used mostly in positive or negative connotations.

In [84]:
# Function to update pie chart based on selected word's positive/negative association
@app.callback(
    Output('word-association-pie', 'figure'),
    Input('word-dropdown', 'value')
)
def update_word_association(selected_word):
    positive_count = positive_word_counts[selected_word]
    negative_count = negative_word_counts[selected_word]
    association_df = pd.DataFrame({'Association': ['Positive', 'Negative'], 'Count': [positive_count, negative_count]})
    figure = px.pie(association_df, names='Association', values='Count', title=f"Positive/Negative Associations of '{selected_word}'")
    return figure

### Yearly Frequency Analysis for a Specific Year (e.g., 2016)

We'll create a bar chart that shows the frequency of the selected word in a specific year.

In [85]:
# Function to update bar chart for the frequency of the selected word in 2016
@app.callback(
    Output('word-frequency-year', 'figure'),
    Input('word-dropdown', 'value')
)
def update_word_frequency(selected_word):
    word_data_2016 = data[(data['Processed_Paragraph'].apply(lambda x: selected_word in x)) & (pd.to_datetime(data['Article Date']).dt.year == 2016)]
    word_count_2016 = len(word_data_2016)
    frequency_df = pd.DataFrame({'Word': [selected_word], 'Frequency': [word_count_2016]})
    figure = px.bar(frequency_df, x='Word', y='Frequency', title=f"Frequency of '{selected_word}' in 2016")
    return figure

### Extra Details for the Dashboard Layout

We'll integrate all the visualizations and interactive components to create the final dashboard layout.

In [86]:
app.layout = html.Div([
    html.H1("AI Perception Analysis Dashboard"),
    dcc.Graph(figure=positive_word_cloud),
    dcc.Graph(figure=negative_word_cloud),
    dcc.Graph(figure=positive_bar_chart),
    dcc.Graph(figure=negative_bar_chart),
    dcc.Graph(figure=health_bar_chart),
    html.Div([
        html.H3("Select a Word for Analysis:"),
        word_dropdown,
        dcc.Graph(id='word-trend-chart'),
        dcc.Graph(id='word-association-pie'),
        dcc.Graph(id='word-frequency-year'),
    ])
])

if __name__ == '__main__':
    app.run_server(debug=True)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/