In [85]:
import pandas as pd 
from collections import Counter
import string
# Importing nltk resources for text preprocessing
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
import re
from spellchecker import SpellChecker

# Data Loading

In [202]:
file_path = "data/ai-perception-concat-paragraphs.csv"
data = pd.read_csv(file_path)

# display first 5 rows of data
data.head()

Unnamed: 0,Article ID,Article Date,NYT section,Paragraph,Title,Fiction,Cyborg (positive),Decisions (positive),Education (positive),Entertain (positive),...,Cyborg (negative),Ethics (negative),Military (negative),Progress (negative),Singularity (negative),Work (negative),AI Mood,AI Relevance,Other (negative),Other (positive)
0,4fd100e58eb7c8105d5bbb33,2012-04-01 00:00:00 UTC,Arts; Style; Magazine,9. The robot designs woven into these tea towe...,Stijl Council,0,0,0,0,0,...,0,0,0,0,0,0,3.0,2.333333,,
1,4fd100e88eb7c8105d5bbd2d,,Science; Health,"In 1818, Mary Shelley's ''Frankenstein'' raise...",Statues to Golems to R2-D2,2,0,0,0,0,...,0,1,2,0,1,0,2.333333,5.0,,
2,4fd14a668eb7c8105d627c40,,Business; Opinion,Mr. Culbertson would use ''powerful new tools'...,ECONOMIC POLICY,0,0,2,0,0,...,0,0,0,0,0,0,4.0,4.333333,,manage entire economies AI and computers could...
3,4fd14a678eb7c8105d627d24,1986-02-16 00:00:00 UTC,Business,"Some golfer. Fortunately, you'll never have to...",IN PURSUIT OF THE PERFECT GOLF BALL,0,0,0,0,0,...,0,0,0,0,0,0,3.0,4.0,,
4,4fd14a678eb7c8105d627d35,1986-02-16 00:00:00 UTC,U.S.,All week long Navy divers and salvage experts ...,LATEST PICTURES FROM NASA SHOW WODER FIRE ON B...,0,0,0,0,0,...,0,0,0,0,0,0,3.0,3.333333,,


In [206]:
print("Rows:", data.shape[0])
# Count NaNs in 'Article Date' column
print("Number of NaNs in 'Article Date' column: ", data['Article Date'].isnull().sum(), f"({round(data['Article Date'].isnull().sum()/data.shape[0]*100, 2)}%)")

Rows: 3365
Number of NaNs in 'Article Date' column:  1418 (42.14%)


In [207]:
# change 'Article Date' in datetime64[ns, UTC] format and change to 'Article Year' in int64 format
# Convert the 'Article Date' column to datetime, coercing errors
# data['Article Date'] = pd.to_datetime(data['Article Date'], errors='coerce')
# data['Article Year'] = data['Article Date'].dt.year
# # convert to int64 - ignore nulls
# data['Article Year'] = data['Article Year'].fillna(0)
# data['Article Year'] = data['Article Year'].astype('int64')

# data.head()

# Text Preprocessing

### Define Stop Words

In [208]:
# Define common English stop words - set class data type
stop_words = {'thus', 'the', 'and', 'to', 'of', 'a', 'in', 'that', 'is', 'was', 'he', 'for', 'it', 'with', 'as', 'his', 'on', 'be', 'at', 'by', 'i', 'this', 'had', 'not', 'but', 'from', 'or', 'have', 'an', 'they', 'which', 'you', 'were', 'her', 'their', 'we', 'its', 'said', 'like', '—', '——', '-', '--', '  '}

# load frequent words dataset - columns are 'word' and 'count'
freq_words = pd.read_csv('data/unigram_freq.csv')

# add 1 and 2 letter words to the stop_words set if they are not already there
for i in range(len(freq_words['word'])):
    word = str(freq_words['word'][i])
    if len(word) <= 2:
        stop_words.add(word)

# add stop words with count more than 550,000,000
for i in range(len(freq_words['word'])):
    word = str(freq_words['word'][i])
    if freq_words['count'][i] > 550000000:
        stop_words.add(word)

# create pandas series with words with count less than 100,000
less_freq_words = freq_words[freq_words['count'] < 100000]['word'].tail(100000).tolist()
# add less_freq_words to stop_words set
stop_words = stop_words.union(set(less_freq_words))

In [209]:
# freq_words[freq_words['count'] > 550000000].shape
freq_words[freq_words['count'] < 100000].shape

(233846, 2)

In [210]:
print(len(stop_words))

100775


### Tokenize Text

In [211]:
spell = SpellChecker()

# Function to tokenize text (split by spaces and remove punctuation)
def custom_tokenize(text):
    # Convert to lower case
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # write 'artificial intelligence' phrase as single hyphenated word 'artificial_intelligence'
    text = text.replace("artificial intelligence", "artificial_intelligence")
    text = text.replace(" ai ", " AI ")
    # Remove numbers and special characters but keep hyphenated words
    text = re.sub(r'[^a-zA-Z0-9-]', ' ', text)
    # Remove numbers
    # text = re.sub(r'[0-9]', ' ', text)
    # Tokenize by spaces
    tokens = text.split()
    # Include only correctly spelled words
    tokens = [word for word in tokens if word in spell]

    return tokens

# Updated function to preprocess text (tokenize and remove stop words) with handling for missing or non-string values
def custom_preprocess_text(text):
    if pd.isnull(text) or not isinstance(text, str):
        return []
    
    # Tokenize the text
    tokens = custom_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

In [212]:
# Apply the custom preprocessing function to the 'Paragraph' column
data['Processed_Paragraph'] = data['Paragraph'].apply(custom_preprocess_text)

# Preview the processed paragraphs of the first few rows
data[['Paragraph', 'Processed_Paragraph']].head()

Unnamed: 0,Paragraph,Processed_Paragraph
0,9. The robot designs woven into these tea towe...,"[robot, designs, woven, into, these, tea, towe..."
1,"In 1818, Mary Shelley's ''Frankenstein'' raise...","[mary, frankenstein, raised, specter, machines..."
2,Mr. Culbertson would use ''powerful new tools'...,"[powerful, tools, model, manage, entire, econo..."
3,"Some golfer. Fortunately, you'll never have to...","[some, golfer, fortunately, youll, never, meet..."
4,All week long Navy divers and salvage experts ...,"[week, long, navy, divers, salvage, experts, u..."


## Task 1: Word Associations (Positive and Negative)

To find words with the most positive and negative associations, we analyze the processed paragraphs in conjunction with the positive and negative columns in the dataset. We create functions to aggregate the word counts based on these associations.

In [213]:
# Define columns representing positive and negative associations
positive_columns = ['Cyborg (positive)', 'Decisions (positive)', 'Education (positive)', 'Entertain (positive)', 'Healthcare (positive)', 'Singularity (positive)', 'Transportation (positive)', 'Work (positive)']
negative_columns = ['Controling AI (negative)', 'Cyborg (negative)', 'Ethics (negative)', 'Military (negative)', 'Progress (negative)', 'Singularity (negative)', 'Work (negative)']
# Define remaining columns
all_columns = data.columns.tolist()
remaining_columns = [col for col in all_columns if col not in positive_columns and col not in negative_columns]

# Function to count words with positive and negative associations
def count_associations(row):
    # Extract processed paragraph
    words = row['Processed_Paragraph']
    # Initialize counters for positive and negative associations
    positive_counter = Counter()
    negative_counter = Counter()
    # Iterate through words and update counters based on associations in the row
    for word in words:
        if any(row[col] > 0 for col in positive_columns):
            positive_counter[word] += 1
        if any(row[col] > 0 for col in negative_columns):
            negative_counter[word] += 1
    return positive_counter, negative_counter

In [214]:
# Apply the function to count word associations
positive_word_counts = Counter()
negative_word_counts = Counter()
# Create word count dataframe with 'Article ID', 'Article Date', 'NYT Section', 'Title', 'AI Mood', 'AI Relevance', '
for _, row in data.iterrows():
    positive, negative = count_associations(row)
    positive_word_counts += positive
    negative_word_counts += negative

### Validity Checks

In [216]:
# print length of positive_word_counts and negative_word_counts dictionaries
print('positive word count:', len(positive_word_counts))
print('negative word count:', len(negative_word_counts))

positive word count: 14170
negative word count: 10235


In [217]:
# check if 'robot' is contained in negative_word_counts
print('robot in positive words:', 'robot' in positive_word_counts, '\nrobot in negative words:', 'robot' in negative_word_counts)

print('robot (positive):', positive_word_counts['robot'])
print('robot (negative):', negative_word_counts['robot'])

robot in positive words: True 
robot in negative words: True
robot (positive): 2149
robot (negative): 785


Check top positively and negatively associated words

In [218]:
# Get top 10 positive and negative words
top_positive_words = positive_word_counts.most_common(10)
top_negative_words = negative_word_counts.most_common(10)

# top_positive_words, top_negative_words
print('top positive:', top_positive_words, '\ntop negative:', top_negative_words)

top positive: [('robot', 2149), ('intelligence', 1954), ('artificial', 1871), ('computer', 647), ('human', 597), ('technology', 562), ('robots', 522), ('AI', 490), ('could', 484), ('than', 460)] 
top negative: [('intelligence', 1278), ('artificial', 1187), ('robot', 785), ('human', 426), ('computer', 388), ('AI', 365), ('technology', 351), ('robots', 300), ('people', 271), ('could', 261)]


### Pivot Sentiment Columns into Rows

In [243]:
# Pivot the sentiment columns into rows creating a new row for each sentiment that has a non-zero entry
data_melt = pd.melt(data, id_vars=remaining_columns, value_vars=positive_columns + negative_columns, var_name='Sentiment_Type', value_name='Sentiment_Value')

# drop zero entries from 'Value' column
data_melt_nonzero = data_melt[data_melt['Sentiment_Value'] != 0]
# Preview pivotted nonzero table
data_melt_nonzero.head()

Unnamed: 0,Article ID,Article Date,NYT section,Paragraph,Title,Fiction,AI Mood,AI Relevance,Other (negative),Other (positive),Processed_Paragraph,Sentiment_Type,Sentiment_Value
16,4fd156a18eb7c8105d63aed2,1986-08-10 00:00:00 UTC,U.S.,The robot must be keyed to a single individual...,"TO ASSIST HANDICAPPED, A ROBOT THAT CAN HEAR",0,4.333333,5.0,,could be the tool that a handicapped person ...,"[robot, must, keyed, single, individuals, voic...",Cyborg (positive),2
115,4fd16d848eb7c8105d660007,,U.S.,"The F.B.I. is enthusiastic about Big Floyd, wh...",'BIG FLOYD' JOINS THE FORCE,0,3.666667,5.0,,Criminal Investigation Lesser Crimes diffe...,"[fbi, enthusiastic, big, floyd, whose, namesak...",Cyborg (positive),1
125,4fd1707b8eb7c8105d66317b,1986-08-10 00:00:00 UTC,Arts,"''Condor,'' on the other hand, tries to be amu...",WHEN THE SLUSH PILE COMES TO LIFE,1,3.0,3.333333,,,"[condor, hand, tries, amusing, after, hero, se...",Cyborg (positive),1
162,4fd1781e8eb7c8105d66ef93,,Business,"''The Tomorrow Makers,'' by Grant Fjermedal ($...",HOW TO AVOID TUNNEL VISION,0,3.111111,4.666667,AI minds are wildly different from hum...,robotic immortality,"[tomorrow, makers, grant, macmillan, volume, d...",Cyborg (positive),2
275,4fd190318eb7c8105d696da7,,Technology; Science; Week in Review,WHEN the computer scientist John McCarthy coin...,IDEAS AND TRENDS: Can Machines Learn to Think?...,0,2.888889,4.555556,AI companies failing,science and technology development suppo...,"[computer, scientist, john, mccarthy, coined, ...",Cyborg (positive),1


In [220]:
data_melt_nonzero['Sentiment_Value'].value_counts().tail(10)

Sentiment_Value
12    5
9     4
10    4
13    3
16    2
15    2
19    1
11    1
18    1
14    1
Name: count, dtype: int64

In [221]:
# The article with a sentiment 'Value' of 14 is heavily associated with 'Military (negative)'
data_melt_nonzero[data_melt_nonzero['Sentiment_Value'] == 14]

Unnamed: 0,Article ID,Article Date,NYT section,Paragraph,Title,Fiction,AI Mood,AI Relevance,Other (negative),Other (positive),Processed_Paragraph,Sentiment_Type,Sentiment_Value
39653,54626c6e79881072f4f730ac,,Science,Warfare is increasingly guided by software. To...,Fearing Bombs That Can Pick Whom to Kill,0,2.444444,4.833333,,warfare,"[warfare, increasingly, guided, software, toda...",Military (negative),14


In [222]:

# Create word count dataframe with 'Article ID', 'Article Date', 'Word', 'Count', 'NYT Section', 'Title', 'AI Mood', 'AI Relevance', 'Sentiment (Positive/Negative)', 'Value'
# word_count_df = pd.DataFrame(columns=['Article ID', 'Article Date', 'Word', 'Count', 'NYT Section', 'Title', 'AI Mood', 'AI Relevance', 'Sentiment (Positive/Negative)', 'Value'])


# expand 'Processed_Paragraph' column list entries into a row for each word
data_word_expand = data_melt_nonzero.explode('Processed_Paragraph')
# reset index
data_word_expand.reset_index(drop=True, inplace=True)
# rename 'Processed_Paragraph' column to 'Word'
data_word_expand.rename(columns={'Processed_Paragraph': 'Word'}, inplace=True)
# drop rows with NaN values
# data_word_expand.dropna(inplace=True)
# add count value from positive_word_counts and negative_word_counts to data_word_expand dataframe for each word and sentiment type
data_word_expand['Count'] = data_word_expand.apply(lambda row: positive_word_counts[row['Word']] if row['Sentiment_Type'] in positive_columns else negative_word_counts[row['Word']], axis=1)
# reset index
data_word_expand.reset_index(drop=True, inplace=True)

In [223]:
data_word_expand.shape

(455138, 14)

In [224]:
# print columns of data_word_expand in single line with '' around col name
print(' '.join([f"'{col}'," for col in data_word_expand.columns.tolist()]))

'Article ID', 'Article Date', 'NYT section', 'Paragraph', 'Title', 'Fiction', 'AI Mood', 'AI Relevance', 'Other (negative)', 'Other (positive)', 'Word', 'Sentiment_Type', 'Sentiment_Value', 'Count',


In [226]:
# drop unnecessary columns in data_word_expand - only keep 'Article ID', 'Word' 'Sentiment_Type', 'AI Mood', 'Sentiment_Value', 'AI Relevance', 'Count'
data_word_expand_opt = data_word_expand.drop(['Article Date', 'NYT section', 'Paragraph', 'Title', 'Fiction', 'Other (negative)', 'Other (positive)'], axis=1) # 'Article Year'

data_word_expand_opt.head()

Unnamed: 0,Article ID,AI Mood,AI Relevance,Word,Sentiment_Type,Sentiment_Value,Count
0,4fd156a18eb7c8105d63aed2,4.333333,5.0,robot,Cyborg (positive),2,2149
1,4fd156a18eb7c8105d63aed2,4.333333,5.0,must,Cyborg (positive),2,65
2,4fd156a18eb7c8105d63aed2,4.333333,5.0,keyed,Cyborg (positive),2,1
3,4fd156a18eb7c8105d63aed2,4.333333,5.0,single,Cyborg (positive),2,25
4,4fd156a18eb7c8105d63aed2,4.333333,5.0,individuals,Cyborg (positive),2,9


In [227]:
# Create new df that keeps just the 'positive' or 'negative' within the parenthesis of the 'Sentiment_Type' column
data_word_pos_neg = data_word_expand_opt.copy()
data_word_pos_neg['Sentiment_Type'] = data_word_pos_neg['Sentiment_Type'].str.extract(r'\((.*?)\)')

data_word_pos_neg.head()

Unnamed: 0,Article ID,AI Mood,AI Relevance,Word,Sentiment_Type,Sentiment_Value,Count
0,4fd156a18eb7c8105d63aed2,4.333333,5.0,robot,positive,2,2149
1,4fd156a18eb7c8105d63aed2,4.333333,5.0,must,positive,2,65
2,4fd156a18eb7c8105d63aed2,4.333333,5.0,keyed,positive,2,1
3,4fd156a18eb7c8105d63aed2,4.333333,5.0,single,positive,2,25
4,4fd156a18eb7c8105d63aed2,4.333333,5.0,individuals,positive,2,9


In [228]:
data_word_pos_neg[data_word_pos_neg['Word'] == 'AI']

Unnamed: 0,Article ID,AI Mood,AI Relevance,Word,Sentiment_Type,Sentiment_Value,Count
3905,4fd24ceb8eb7c8105d7ece3b,3.250000,4.638889,AI,positive,1,490
3913,4fd24ceb8eb7c8105d7ece3b,3.250000,4.638889,AI,positive,1,490
3972,4fd24ceb8eb7c8105d7ece3b,3.250000,4.638889,AI,positive,1,490
4041,4fd24ceb8eb7c8105d7ece3b,3.250000,4.638889,AI,positive,1,490
4230,4fd24d6b8eb7c8105d7ee41d,3.533333,4.533333,AI,positive,1,490
...,...,...,...,...,...,...,...
454650,568a04687988103a8dd337e6,2.666667,4.750000,AI,negative,13,365
454679,568a04687988103a8dd337e6,2.666667,4.750000,AI,negative,13,365
454869,5718b2ba7988102ed807b6e4,2.666667,4.400000,AI,negative,1,365
455018,5718b2ba7988102ed807b6e4,2.666667,4.400000,AI,negative,1,365


In [229]:
# check rows with 'Word' == 'robot' and sort by 'Article ID' 
data_word_pos_neg[data_word_pos_neg['Word'] == 'robot'].sort_values(by=['Article ID']).head(8)

Unnamed: 0,Article ID,AI Mood,AI Relevance,Word,Sentiment_Type,Sentiment_Value,Count
215208,4fd14a678eb7c8105d627d35,3.0,3.333333,robot,positive,1,2149
172992,4fd14a688eb7c8105d627e17,4.0,3.666667,robot,positive,1,2149
239943,4fd14a688eb7c8105d627e17,4.0,3.666667,robot,positive,1,2149
215246,4fd14a688eb7c8105d627e17,4.0,3.666667,robot,positive,1,2149
215336,4fd155958eb7c8105d639695,3.111111,3.444444,robot,positive,2,2149
215273,4fd155958eb7c8105d639695,3.111111,3.444444,robot,positive,2,2149
215307,4fd155958eb7c8105d639695,3.111111,3.444444,robot,positive,2,2149
27451,4fd1559c8eb7c8105d639bc6,3.333333,4.666667,robot,positive,1,2149


In [230]:
# aggregate data_word_pos_neg by 'Article ID', 'Word', and 'Sentiment_Type' taking the first value of 'AI Mood', 'AI Relevance' and 'Count' and summing 'Sentiment_Value'
data_word_pos_neg_agg = data_word_pos_neg.groupby(['Article ID', 'Word', 'Sentiment_Type']).agg({'AI Mood': 'first', 'AI Relevance': 'first', 'Count': 'first', 'Sentiment_Value': 'sum'}).reset_index()

data_word_pos_neg_agg.head()

Unnamed: 0,Article ID,Word,Sentiment_Type,AI Mood,AI Relevance,Count,Sentiment_Value
0,4fd100e88eb7c8105d5bbd2d,aircraft,negative,2.333333,5.0,16,5
1,4fd100e88eb7c8105d5bbd2d,aircraft,positive,2.333333,5.0,20,1
2,4fd100e88eb7c8105d5bbd2d,amok,negative,2.333333,5.0,5,5
3,4fd100e88eb7c8105d5bbd2d,amok,positive,2.333333,5.0,3,1
4,4fd100e88eb7c8105d5bbd2d,artificial,negative,2.333333,5.0,1187,5


In [231]:
# print each unique value of 'Word' in data_word_pos_neg_agg
print(len(data_word_pos_neg_agg['Word'].unique()))
print(data_word_pos_neg_agg['Word'].unique())

15195
['aircraft' 'amok' 'artificial' ... 'commissioner' 'fiduciary' 'kara']


In [232]:
print(data_word_pos_neg_agg.shape)
data_word_pos_neg_agg[data_word_pos_neg_agg['Word'] == 'robot'].sort_values(by=['Article ID']).head(8)

(137722, 7)


Unnamed: 0,Article ID,Word,Sentiment_Type,AI Mood,AI Relevance,Count,Sentiment_Value
131,4fd14a678eb7c8105d627d35,robot,positive,3.0,3.333333,2149,1
167,4fd14a688eb7c8105d627e17,robot,positive,4.0,3.666667,2149,3
324,4fd155958eb7c8105d639695,robot,positive,3.111111,3.444444,2149,6
367,4fd1559c8eb7c8105d639bc6,robot,positive,3.333333,4.666667,2149,2
433,4fd155f98eb7c8105d639daa,robot,negative,3.666667,4.333333,785,2
434,4fd155f98eb7c8105d639daa,robot,positive,3.666667,4.333333,2149,1
485,4fd156548eb7c8105d63aa5b,robot,negative,2.666667,4.166667,785,1
515,4fd156558eb7c8105d63ab15,robot,positive,3.333333,3.333333,2149,1


In [247]:
# Preview words that only occur once
data_word_pos_neg_agg[data_word_pos_neg_agg['Count'] == 1]

Unnamed: 0,Article ID,Word,Sentiment_Type,AI Mood,AI Relevance,Count,Sentiment_Value
118,4fd14a678eb7c8105d627d35,challengers,positive,3.000000,3.333333,1,1
121,4fd14a678eb7c8105d627d35,exploded,positive,3.000000,3.333333,1,1
122,4fd14a678eb7c8105d627d35,hurled,positive,3.000000,3.333333,1,1
198,4fd14a688eb7c8105d627e36,dime,positive,3.190476,3.809524,1,4
225,4fd14a688eb7c8105d627e36,majors,positive,3.190476,3.809524,1,4
...,...,...,...,...,...,...,...
137700,572350e37988101b346ef18a,fiduciary,negative,2.666667,4.000000,1,1
137701,572350e37988101b346ef18a,fiduciary,positive,2.666667,4.000000,1,2
137708,572350e37988101b346ef18a,kara,negative,2.666667,4.000000,1,1
137709,572350e37988101b346ef18a,kara,positive,2.666667,4.000000,1,2


# Output Formatted Data

In [None]:
import pantab

In [244]:
# remove ' UTC' from 'Article Date' column in data_melt_nonzero
data_melt_nonzero.loc[:, 'Article Date'] = data_melt_nonzero['Article Date'].str.replace(' UTC', '')

data_melt_nonzero.head()

Unnamed: 0,Article ID,Article Date,NYT section,Paragraph,Title,Fiction,AI Mood,AI Relevance,Other (negative),Other (positive),Processed_Paragraph,Sentiment_Type,Sentiment_Value
16,4fd156a18eb7c8105d63aed2,1986-08-10 00:00:00,U.S.,The robot must be keyed to a single individual...,"TO ASSIST HANDICAPPED, A ROBOT THAT CAN HEAR",0,4.333333,5.0,,could be the tool that a handicapped person ...,"[robot, must, keyed, single, individuals, voic...",Cyborg (positive),2
115,4fd16d848eb7c8105d660007,,U.S.,"The F.B.I. is enthusiastic about Big Floyd, wh...",'BIG FLOYD' JOINS THE FORCE,0,3.666667,5.0,,Criminal Investigation Lesser Crimes diffe...,"[fbi, enthusiastic, big, floyd, whose, namesak...",Cyborg (positive),1
125,4fd1707b8eb7c8105d66317b,1986-08-10 00:00:00,Arts,"''Condor,'' on the other hand, tries to be amu...",WHEN THE SLUSH PILE COMES TO LIFE,1,3.0,3.333333,,,"[condor, hand, tries, amusing, after, hero, se...",Cyborg (positive),1
162,4fd1781e8eb7c8105d66ef93,,Business,"''The Tomorrow Makers,'' by Grant Fjermedal ($...",HOW TO AVOID TUNNEL VISION,0,3.111111,4.666667,AI minds are wildly different from hum...,robotic immortality,"[tomorrow, makers, grant, macmillan, volume, d...",Cyborg (positive),2
275,4fd190318eb7c8105d696da7,,Technology; Science; Week in Review,WHEN the computer scientist John McCarthy coin...,IDEAS AND TRENDS: Can Machines Learn to Think?...,0,2.888889,4.555556,AI companies failing,science and technology development suppo...,"[computer, scientist, john, mccarthy, coined, ...",Cyborg (positive),1


In [248]:
# output to 'data/ai_concat_pivot.csv'
data_melt_nonzero_opt = data_melt_nonzero.drop(['Processed_Paragraph'], axis=1)
data_melt_nonzero_opt.to_csv('data/ai_concat_pivot.csv', index=False)
# output to .hyper file
pantab.frame_to_hyper(data_melt_nonzero_opt, 'data/ai_concat_pivot.hyper', table='ai_concat_pivot')

In [None]:
# save to CSV
data_word_pos_neg_agg.to_csv('data/data_concat_words_pos_neg.csv', index=False)
# save as .hyper file
pantab.frame_to_hyper(data_word_pos_neg_agg, 'data/data_concat_words_pos_neg.hyper', table='data_concat_words_pos_neg')

In [None]:
# data_word_expand_opt.to_csv('data/data_concat_word_count.csv')