In [21]:
import pandas as pd
import os
import datetime
import pickle
import re
import time

## 1. Import Metadata

In [22]:
# Load the metadata table covering every news article and show its schema.
# NOTE(review): read_pickle can execute arbitrary code while unpickling;
# presumably this file comes from a trusted upstream step -- confirm.
allNews = pd.read_pickle(
    '/home/ec2-user/SageMaker/New Uncertainty/parsed_xml.pkl'
)
print(allNews.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 822737 entries, 0 to 822736
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   ID             822737 non-null  object        
 1   Title          822737 non-null  object        
 2   Type           822737 non-null  object        
 3   StartDate      822737 non-null  datetime64[ns]
 4   EndDate        822737 non-null  object        
 5   Text           822737 non-null  object        
 6   TextWordCount  822737 non-null  object        
 7   PubTitle       822737 non-null  object        
 8   SourceType     822737 non-null  object        
 9   Year           822737 non-null  int64         
 10  Month          822737 non-null  int64         
 11  Newspaper      822737 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(9)
memory usage: 75.3+ MB
None


In [23]:
# Load the per-article regulation-relevance results (expanded reg sentences and
# noun-chunk match columns) and show the schema.
# NOTE(review): read_pickle can execute arbitrary code while unpickling;
# presumably this file comes from a trusted upstream step -- confirm.
df_regSentsExpand = pd.read_pickle(
    '/home/ec2-user/SageMaker/New Uncertainty/Reg Relevance/RegSentsExpand_NounChunks3.pkl'
)
print(df_regSentsExpand.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 822737 entries, 0 to 822736
Data columns (total 6 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   ID                           822737 non-null  object
 1   RegSentsExpand               822737 non-null  object
 2   NounChunksMatch              822737 non-null  int64 
 3   NounChunkMatchWords          822737 non-null  object
 4   NounChunkMatchWordsFiltered  822737 non-null  object
 5   NounChunkMatchFiltered       822737 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 37.7+ MB
None


In [24]:
# Attach selected article metadata columns to every row of df_regSentsExpand
# with a left join on the shared ID key.
df = df_regSentsExpand.merge(
    right=allNews[
        ['ID', 'Title', 'StartDate', 'Text', 'Newspaper',
         'Year', 'Month', 'Type', 'SourceType']
    ],
    how='left',
    on='ID',
)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 822737 entries, 0 to 822736
Data columns (total 14 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   ID                           822737 non-null  object        
 1   RegSentsExpand               822737 non-null  object        
 2   NounChunksMatch              822737 non-null  int64         
 3   NounChunkMatchWords          822737 non-null  object        
 4   NounChunkMatchWordsFiltered  822737 non-null  object        
 5   NounChunkMatchFiltered       822737 non-null  int64         
 6   Title                        822737 non-null  object        
 7   StartDate                    822737 non-null  datetime64[ns]
 8   Text                         822737 non-null  object        
 9   Newspaper                    822737 non-null  object        
 10  Year                         822737 non-null  int64         
 11  Month                     

In [25]:
# Load the list of article IDs that remain after duplicate removal; the actual
# filtering of df against this list happens in the merge that follows.
IDs_nodup = pd.read_csv(
    '/home/ec2-user/SageMaker/New Uncertainty/IDs_no_duplicates.csv'
)
print(IDs_nodup.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 788516 entries, 0 to 788515
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   ID      788516 non-null  int64
dtypes: int64(1)
memory usage: 6.0 MB
None


In [26]:
# The CSV stores IDs as integers; cast them to str before joining on ID
# (df['ID'] is an object column -- presumably strings; confirm).
IDs_nodup['ID'] = IDs_nodup['ID'].astype(str)
# Left join from the de-duplicated ID list so only those articles are kept.
df = (
    IDs_nodup
    .merge(right=df, how='left', on='ID')
    .reset_index(drop=True)
)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 788516 entries, 0 to 788515
Data columns (total 14 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   ID                           788516 non-null  object        
 1   RegSentsExpand               788516 non-null  object        
 2   NounChunksMatch              788516 non-null  int64         
 3   NounChunkMatchWords          788516 non-null  object        
 4   NounChunkMatchWordsFiltered  788516 non-null  object        
 5   NounChunkMatchFiltered       788516 non-null  int64         
 6   Title                        788516 non-null  object        
 7   StartDate                    788516 non-null  datetime64[ns]
 8   Text                         788516 non-null  object        
 9   Newspaper                    788516 non-null  object        
 10  Year                         788516 non-null  int64         
 11  Month                     

In [27]:
# Flag an article as regulation-relevant when it has at least one filtered
# noun-chunk match. Rows that do not match are not assigned a value here.
df.loc[df['NounChunkMatchFiltered'].ne(0), 'RegRelevance'] = 1
print(
    "# of reg relevant articles:",
    df.loc[df['RegRelevance'] == 1, 'ID'].nunique(),
)

# of reg relevant articles: 493418


## 2. Calculate Sentiment Scores

In [28]:
# Load the per-article LM uncertainty word counts and matched words.
LMuncertainty = pd.read_csv(
    '/home/ec2-user/SageMaker/New Uncertainty/Sentiment Analysis/LMuncertainty.csv'
)
print(LMuncertainty.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 502909 entries, 0 to 502908
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   ID                502909 non-null  int64 
 1   UncertaintyCount  502909 non-null  int64 
 2   UncertaintyWords  502909 non-null  object
dtypes: int64(2), object(1)
memory usage: 11.5+ MB
None


In [29]:
# Cast the integer IDs read from the CSV to str ahead of the merge on ID.
LMuncertainty['ID'] = LMuncertainty['ID'].astype(str)
print(LMuncertainty.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 502909 entries, 0 to 502908
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   ID                502909 non-null  object
 1   UncertaintyCount  502909 non-null  int64 
 2   UncertaintyWords  502909 non-null  object
dtypes: int64(1), object(2)
memory usage: 11.5+ MB
None


In [30]:
# Sanity check: the first ID (index label 0) should now be a Python str.
print(type(LMuncertainty.loc[0, 'ID']))

<class 'str'>


In [31]:
# Add the LM uncertainty columns to the article table; the left join keeps
# every article in df, including those without an uncertainty record.
df2 = df.merge(
    right=LMuncertainty,
    how='left',
    on='ID',
)
print(df2.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 788516 entries, 0 to 788515
Data columns (total 17 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   ID                           788516 non-null  object        
 1   RegSentsExpand               788516 non-null  object        
 2   NounChunksMatch              788516 non-null  int64         
 3   NounChunkMatchWords          788516 non-null  object        
 4   NounChunkMatchWordsFiltered  788516 non-null  object        
 5   NounChunkMatchFiltered       788516 non-null  int64         
 6   Title                        788516 non-null  object        
 7   StartDate                    788516 non-null  datetime64[ns]
 8   Text                         788516 non-null  object        
 9   Newspaper                    788516 non-null  object        
 10  Year                         788516 non-null  int64         
 11  Month                     

In [32]:
# Load the GI sentiment counts and cast ID to str so it can be joined on.
GIsentiments = pd.read_csv(
    '/home/ec2-user/SageMaker/New Uncertainty/Sentiment Analysis/GIsentiments.csv'
)
GIsentiments['ID'] = GIsentiments['ID'].astype(str)
print(GIsentiments.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 502909 entries, 0 to 502908
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   ID              502909 non-null  object
 1   TotalWordCount  502909 non-null  int64 
 2   GIposCount      502909 non-null  int64 
 3   GInegCount      502909 non-null  int64 
 4   GIposWords      502909 non-null  object
 5   GInegWords      502909 non-null  object
dtypes: int64(3), object(3)
memory usage: 23.0+ MB
None


In [33]:
# Load the LSD sentiment counts and cast ID to str so it can be joined on.
LSDsentiments = pd.read_csv(
    '/home/ec2-user/SageMaker/New Uncertainty/Sentiment Analysis/LSDsentiments.csv'
)
LSDsentiments['ID'] = LSDsentiments['ID'].astype(str)
print(LSDsentiments.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 502909 entries, 0 to 502908
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   ID           502909 non-null  object
 1   LSDposCount  502909 non-null  int64 
 2   LSDnegCount  502909 non-null  int64 
 3   LSDposWords  502909 non-null  object
 4   LSDnegWords  502909 non-null  object
dtypes: int64(2), object(3)
memory usage: 19.2+ MB
None


In [34]:
# Load the LM sentiment counts and cast ID to str so it can be joined on.
LMsentiments = pd.read_csv(
    '/home/ec2-user/SageMaker/New Uncertainty/Sentiment Analysis/LMsentiments.csv'
)
LMsentiments['ID'] = LMsentiments['ID'].astype(str)
print(LMsentiments.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 502909 entries, 0 to 502908
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   ID          502909 non-null  object
 1   LMposCount  502909 non-null  int64 
 2   LMnegCount  502909 non-null  int64 
 3   LMposWords  502909 non-null  object
 4   LMnegWords  502909 non-null  object
dtypes: int64(2), object(3)
memory usage: 19.2+ MB
None


In [35]:
# Left-join the three sentiment tables onto df2 one after another
# (GI, then LM, then LSD), always keyed on ID.
df3 = (
    df2
    .merge(right=GIsentiments, how='left', on='ID')
    .merge(right=LMsentiments, how='left', on='ID')
    .merge(right=LSDsentiments, how='left', on='ID')
)
print(df3.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 788516 entries, 0 to 788515
Data columns (total 30 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   ID                           788516 non-null  object        
 1   RegSentsExpand               788516 non-null  object        
 2   NounChunksMatch              788516 non-null  int64         
 3   NounChunkMatchWords          788516 non-null  object        
 4   NounChunkMatchWordsFiltered  788516 non-null  object        
 5   NounChunkMatchFiltered       788516 non-null  int64         
 6   Title                        788516 non-null  object        
 7   StartDate                    788516 non-null  datetime64[ns]
 8   Text                         788516 non-null  object        
 9   Newspaper                    788516 non-null  object        
 10  Year                         788516 non-null  int64         
 11  Month                     

In [36]:
# Reg relevant articles only
# Take an explicit copy of the filtered rows: later cells add columns to df_reg,
# and doing that on a bare boolean-indexed slice of df3 triggers pandas'
# SettingWithCopyWarning. With .copy(), df_reg owns its data.
df_reg=df3[df3['RegRelevance']==1].copy()
print(df_reg.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 493418 entries, 3 to 788500
Data columns (total 30 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   ID                           493418 non-null  object        
 1   RegSentsExpand               493418 non-null  object        
 2   NounChunksMatch              493418 non-null  int64         
 3   NounChunkMatchWords          493418 non-null  object        
 4   NounChunkMatchWordsFiltered  493418 non-null  object        
 5   NounChunkMatchFiltered       493418 non-null  int64         
 6   Title                        493418 non-null  object        
 7   StartDate                    493418 non-null  datetime64[ns]
 8   Text                         493418 non-null  object        
 9   Newspaper                    493418 non-null  object        
 10  Year                         493418 non-null  int64         
 11  Month                     

In [37]:
# Calculate scores
# Each score is a word count expressed as a percentage of TotalWordCount:
#   UncertaintyScore = UncertaintyCount / TotalWordCount * 100
#   <dic>score       = (<dic>posCount - <dic>negCount) / TotalWordCount * 100
# The columns are collected first and attached with DataFrame.assign, which
# returns a new frame. Assigning directly onto df_reg (a filtered slice of df3)
# raised SettingWithCopyWarning; rebinding df_reg to the assign() result does not.
score_columns={'UncertaintyScore':df_reg['UncertaintyCount']/df_reg['TotalWordCount']*100}
for dic in ['GI','LSD','LM']:
    score_columns[dic+'score']=(df_reg[dic+'posCount']-df_reg[dic+'negCount'])/df_reg['TotalWordCount']*100
# Keyword order is preserved, so the new columns appear as
# UncertaintyScore, GIscore, LSDscore, LMscore -- same as before.
df_reg=df_reg.assign(**score_columns)
print(df_reg.info())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


<class 'pandas.core.frame.DataFrame'>
Int64Index: 493418 entries, 3 to 788500
Data columns (total 34 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   ID                           493418 non-null  object        
 1   RegSentsExpand               493418 non-null  object        
 2   NounChunksMatch              493418 non-null  int64         
 3   NounChunkMatchWords          493418 non-null  object        
 4   NounChunkMatchWordsFiltered  493418 non-null  object        
 5   NounChunkMatchFiltered       493418 non-null  int64         
 6   Title                        493418 non-null  object        
 7   StartDate                    493418 non-null  datetime64[ns]
 8   Text                         493418 non-null  object        
 9   Newspaper                    493418 non-null  object        
 10  Year                         493418 non-null  int64         
 11  Month                     

In [38]:
# Persist the reg-relevant articles together with their sentiment words and scores.
df_reg.to_pickle(
    '/home/ec2-user/SageMaker/New Uncertainty/Sentiment Analysis/RegRelevant_ArticleSentimentWordsScores.pkl'
)

In [39]:
# Keep only the identifying columns and the four article-level scores.
sentimentScores = df_reg[
    [
        'ID',
        'StartDate',
        'Newspaper',
        'UncertaintyScore',
        'GIscore',
        'LMscore',
        'LSDscore',
    ]
]
print(sentimentScores.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 493418 entries, 3 to 788500
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   ID                493418 non-null  object        
 1   StartDate         493418 non-null  datetime64[ns]
 2   Newspaper         493418 non-null  object        
 3   UncertaintyScore  493418 non-null  float64       
 4   GIscore           493418 non-null  float64       
 5   LMscore           493418 non-null  float64       
 6   LSDscore          493418 non-null  float64       
dtypes: datetime64[ns](1), float64(4), object(2)
memory usage: 30.1+ MB
None


In [40]:
# Write the article-level scores to CSV without the DataFrame index.
sentimentScores.to_csv(
    '/home/ec2-user/SageMaker/New Uncertainty/Sentiment Analysis/RegRelevant_ArticleSentimentScores.csv',
    index=False,
)

## 3. Calculate Monthly Average Scores

In [41]:
# Calculate average monthly scores
# Every article contributes ArticleCount=1, so summing it per (Year, Month)
# gives the number of reg-relevant articles in that month.
# The column is added with assign() (which returns a new frame) rather than
# df_reg['ArticleCount']=1, because setting a column on the filtered slice
# raised SettingWithCopyWarning. df_reg still ends up with an ArticleCount column.
df_reg=df_reg.assign(ArticleCount=1)
# The four scores are averaged per month; ArticleCount is summed.
monthlyScores=(
    df_reg[['Year','Month','UncertaintyScore','GIscore','LSDscore','LMscore','ArticleCount']]
    .groupby(['Year','Month'])
    .agg({'UncertaintyScore':'mean','GIscore':'mean',
          'LSDscore':'mean','LMscore':'mean','ArticleCount':'sum'})
    .reset_index()
)
print(monthlyScores.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 428 entries, 0 to 427
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Year              428 non-null    int64  
 1   Month             428 non-null    int64  
 2   UncertaintyScore  428 non-null    float64
 3   GIscore           428 non-null    float64
 4   LSDscore          428 non-null    float64
 5   LMscore           428 non-null    float64
 6   ArticleCount      428 non-null    int64  
dtypes: float64(4), int64(3)
memory usage: 23.5 KB
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [42]:
# Preview the first five monthly rows.
print(monthlyScores.head(5))

   Year  Month  UncertaintyScore   GIscore  LSDscore   LMscore  ArticleCount
0  1985      1          0.668864  1.118075 -0.004204 -1.995671           847
1  1985      2          0.654157  1.237373 -0.201235 -2.112632           772
2  1985      3          0.690767  0.977235 -0.306658 -2.174438           966
3  1985      4          0.715025  1.187243 -0.172904 -2.048845           978
4  1985      5          0.714494  1.224079  0.022134 -1.975888           982


In [43]:
# Write the monthly average scores to CSV without the DataFrame index.
monthlyScores.to_csv(
    '/home/ec2-user/SageMaker/New Uncertainty/Sentiment Analysis/RegRelevant_MonthlySentimentScores.csv',
    index=False,
)