In [1]:
# library
import pandas as pd
import sklearn

In [2]:
df = pd.read_csv('/home/ec2-user/SageMaker/data/test.csv')

In [3]:
df = df[['GOID','Text','Date']]

In [4]:
df = df.dropna()

In [5]:
len(df)

1199520

In [6]:
GOID = df['GOID']
Text = df['Text']
Date = df['Date']

In [66]:
from sklearn.feature_extraction.text import CountVectorizer
transformer = CountVectorizer(binary=True, lowercase=True, stop_words='english')
import nltk
nltk.data.path.append('/home/ec2-user/SageMaker/nltk_data')
from nltk import RegexpTokenizer
tokenizer = RegexpTokenizer(r'economic\w*|economics|economist|economiz\w+|econom\w+|uncertain\w+')
from nltk import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [69]:
# require improvement for better word lemmatizing ()
# use lemmatizer instead of stemmer for better accuracy

def fun_sparse_matrix(text):
    list_words = tokenizer.tokenize(text.lower())
    result=[]
    for word in list_words:
        if lemmatizer.lemmatize(word) in (['economy','econmize','economist','economizing','economic']):
            result.append('economy')
        if lemmatizer.lemmatize(word) in (['uncertain', 'uncertainties','uncertainty','uncertainly']):
            result.append('uncertain')
    return ' '.join(result)

In [70]:
transformated_Text = Text.apply(fun_sparse_matrix)

In [71]:
transform_df = transformer.fit_transform(transformated_Text)

In [72]:
transform_df = pd.DataFrame.sparse.from_spmatrix(transform_df, columns = transformer.get_feature_names())

In [73]:
transform_df.columns

Index(['economy', 'uncertain'], dtype='object')

In [74]:
transform_df['search'] = (transform_df['economy'] + transform_df['uncertain']).astype(bool).astype(int)
transform_df.reset_index(inplace=True)

In [75]:
ECON_df = transform_df[['index', 'search']]
ECON_df = ECON_df[ECON_df['search']==1]

In [76]:
ECON_df = ECON_df.join(Text)

In [77]:
ECON_df = ECON_df.dropna()

In [78]:
ECON_df = ECON_df.join(Date)

In [79]:
ECON_df['search'].sum()

112386

In [80]:
tokenizer_letter = RegexpTokenizer(r'[A-Za-z]+')

def tokenize_letter(Text):
    list_words = tokenizer_letter.tokenize(Text.lower())
    result = []
    for word in list_words:
        result.append(lemmatizer.lemmatize(word))
    return ' '.join(result)

In [81]:
ECON_df['Text'] = ECON_df['Text'].apply(tokenize_letter)
transform_econ_df = transformer.fit_transform(ECON_df['Text'])
transform_econ_df

<112386x1755622 sparse matrix of type '<class 'numpy.int64'>'
	with 26592823 stored elements in Compressed Sparse Row format>

In [82]:
transform_econ_df = pd.DataFrame.sparse.from_spmatrix(transform_econ_df, columns = transformer.get_feature_names())

In [83]:
from nltk.corpus import wordnet as wn

In [84]:
list_words = []
for word in transform_econ_df:
    if len(wn.synsets(word))!=0:
        list_words.append(word)

len(list_words)

60812

In [125]:
transform_econ_df = transform_econ_df[list_words]
transform_econ_df.shape

(112386, 60812)

In [150]:
result_df = transform_econ_df[['policy','legislation','politics','congress', 'deficit', 'federal','regulation','white','car', 'eat']]

In [162]:
from sklearn.preprocessing import PolynomialFeatures
poly_trans = PolynomialFeatures(degree=8, interaction_only=True)

In [163]:
trans_df = poly_trans.fit_transform(result_df)

In [164]:
trans_df = pd.DataFrame.sparse.from_spmatrix(trans_df, columns = poly_trans.get_feature_names(result_df.columns))

In [165]:
trans_df

Unnamed: 0,1,policy,legislation,politics,congress,deficit,federal,regulation,white,car,...,policy congress deficit federal regulation white car eat,legislation politics congress deficit federal regulation white car,legislation politics congress deficit federal regulation white eat,legislation politics congress deficit federal regulation car eat,legislation politics congress deficit federal white car eat,legislation politics congress deficit regulation white car eat,legislation politics congress federal regulation white car eat,legislation politics deficit federal regulation white car eat,legislation congress deficit federal regulation white car eat,politics congress deficit federal regulation white car eat
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112381,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112382,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112383,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112384,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [166]:
trans_df['Date'] = ECON_df['Date']

In [167]:
index_df = trans_df.groupby(pd.to_datetime(trans_df['Date']).dt.month).sum()

In [168]:
index_df

Unnamed: 0_level_0,1,policy,legislation,politics,congress,deficit,federal,regulation,white,car,...,policy congress deficit federal regulation white car eat,legislation politics congress deficit federal regulation white car,legislation politics congress deficit federal regulation white eat,legislation politics congress deficit federal regulation car eat,legislation politics congress deficit federal white car eat,legislation politics congress deficit regulation white car eat,legislation politics congress federal regulation white car eat,legislation politics deficit federal regulation white car eat,legislation congress deficit federal regulation white car eat,politics congress deficit federal regulation white car eat
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,9308.0,674.0,164.0,360.0,407.0,164.0,868.0,193.0,1190.0,943.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8968.0,626.0,153.0,319.0,323.0,154.0,774.0,187.0,1148.0,886.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9868.0,755.0,216.0,363.0,401.0,166.0,859.0,194.0,1254.0,978.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9324.0,657.0,172.0,312.0,375.0,139.0,832.0,201.0,1201.0,969.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,9545.0,676.0,188.0,332.0,369.0,141.0,808.0,181.0,1258.0,1008.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,9165.0,686.0,177.0,338.0,357.0,109.0,795.0,180.0,1257.0,905.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8678.0,653.0,186.0,306.0,377.0,102.0,825.0,185.0,1184.0,944.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,8642.0,635.0,124.0,318.0,297.0,116.0,813.0,159.0,1123.0,930.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,9628.0,701.0,173.0,379.0,361.0,115.0,864.0,181.0,1358.0,897.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,9900.0,739.0,153.0,380.0,363.0,156.0,910.0,202.0,1334.0,999.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
