In [134]:
import pandas as pd
import numpy as np
import nltk
import string

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [123]:
data = [['\tthis is fiscal, projection', '2019-10-15'], ['the, world is going down\n', '2019-10-16'], ['I am going to talk about fiscal policies\n', '2019-10-17']] 
df = pd.DataFrame(data, columns=['text', 'date']) 

In [124]:
df['words_count_raw'] = df['text'].apply(lambda x: len(str(x).split()) )

In [125]:
def preprocess(text):
    text = text.replace('\n', '')
    text = text.replace('\t', '')
    return text



In [126]:
# df = df.drop('word-count', 1)
# df.drop('word-count', axis=1, inplace=True)

In [127]:
from datetime import datetime as dt

df['date'] = df['date'].apply(lambda x: dt.strptime(x, '%Y-%m-%d'))

In [128]:
df['text'] = df['text'].apply(preprocess)

In [129]:
tokenizer = RegexpTokenizer(r'\w+')

df['text'] = df['text'].apply(lambda x: tokenizer.tokenize(x.lower()))

In [130]:
df['text'] = df['text'].apply(lambda x: [w for w in x if w not in stopwords.words('english')])

In [132]:
df['words_count_processed'] = df['text'].apply(lambda x: len(str(x).split()) )

In [133]:
df

Unnamed: 0,text,date,words_count_raw,words_count_processed
0,"[fiscal, projection]",2019-10-15,4,2
1,"[world, going]",2019-10-16,5,2
2,"[going, talk, fiscal, policies]",2019-10-17,8,4


In [146]:
all_sentences = []

[all_sentences.append(term) for row in df['text'] for term in row]

all_sentences

['fiscal',
 'projection',
 'world',
 'going',
 'going',
 'talk',
 'fiscal',
 'policies']

In [135]:
from sklearn.feature_extraction.text import CountVectorizer

In [136]:
count_vectorizer = CountVectorizer()

In [148]:
bow = count_vectorizer.fit_transform(all_sentences)


In [151]:
bow.toarray().sum(axis=0)

array([2, 2, 1, 1, 1, 1], dtype=int64)

In [139]:
corpus = [["this is spam, 'SPAM'"],["this is ham, 'HAM'"],["this is nothing, 'NOTHING'"]]


TypeError: fit_transform() got an unexpected keyword argument 'tokenizer'

In [193]:
new_df = df[ ['text', 'words_count_processed'] ].copy()
concatenated_df = pd.concat([df, new_df], keys=['old', 'new'], ignore_index=False, sort=False)
concatenated_df

Unnamed: 0,Unnamed: 1,text,date,words_count_raw,words_count_processed
old,0,"[fiscal, projection]",2019-10-15,4.0,2
old,1,"[world, going]",2019-10-16,5.0,2
old,2,"[going, talk, fiscal, policies]",2019-10-17,8.0,4
new,0,"[fiscal, projection]",NaT,,2
new,1,"[world, going]",NaT,,2
new,2,"[going, talk, fiscal, policies]",NaT,,4


In [199]:
from collections import Counter
bow_counter = [Counter(row) for row in concatenated_df['text']  ]

In [200]:
bow_counter

[Counter({'fiscal': 1, 'projection': 1}),
 Counter({'world': 1, 'going': 1}),
 Counter({'going': 1, 'talk': 1, 'fiscal': 1, 'policies': 1}),
 Counter({'fiscal': 1, 'projection': 1}),
 Counter({'world': 1, 'going': 1}),
 Counter({'going': 1, 'talk': 1, 'fiscal': 1, 'policies': 1})]

In [201]:
bow = sum(bow_counter, Counter())

In [202]:
bow

Counter({'fiscal': 4,
         'projection': 2,
         'world': 2,
         'going': 4,
         'talk': 2,
         'policies': 2})

In [198]:
elements = bow.elements()
list(elements)

['fiscal',
 'fiscal',
 'projection',
 'world',
 'going',
 'going',
 'talk',
 'policies']

In [165]:
most_common = bow_sum.most_common(10)
most_common

[('fiscal', 2),
 ('going', 2),
 ('projection', 1),
 ('world', 1),
 ('talk', 1),
 ('policies', 1)]

In [167]:
total = sum(bow_sum.values())
print('Total number of words: %d' % total)

Total number of words: 8


In [172]:
for (name, count) in sorted(bow_sum.items()):
    print(f'{name}: {count}')

fiscal: 2
going: 2
policies: 1
projection: 1
talk: 1
world: 1


In [204]:
from nltk.stem.porter import PorterStemmer

In [218]:
porter_stemmer = PorterStemmer()

In [225]:
plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',
...            'died', 'agreed', 'owned', 'humbled', 'sized',
...            'meeting', 'stating', 'siezing', 'itemization',
...            'sensational', 'traditional', 'reference', 'colonizer',
...            'plotted', 'table\'s', 'having']

In [226]:
singles = [porter_stemmer.stem(plural) for plural in plurals]

In [227]:
singles

['caress',
 'fli',
 'die',
 'mule',
 'deni',
 'die',
 'agre',
 'own',
 'humbl',
 'size',
 'meet',
 'state',
 'siez',
 'item',
 'sensat',
 'tradit',
 'refer',
 'colon',
 'plot',
 "table'",
 'have']

In [213]:
from nltk.stem.snowball import SnowballStemmer

In [215]:
print(" ".join(SnowballStemmer.languages))

arabic danish dutch english finnish french german hungarian italian norwegian porter portuguese romanian russian spanish swedish


In [222]:
snowball_stemmer = SnowballStemmer("english",ignore_stopwords=True)

In [228]:
singles = [snowball_stemmer.stem(plural) for plural in plurals]

In [229]:
singles

['caress',
 'fli',
 'die',
 'mule',
 'deni',
 'die',
 'agre',
 'own',
 'humbl',
 'size',
 'meet',
 'state',
 'siez',
 'item',
 'sensat',
 'tradit',
 'refer',
 'colon',
 'plot',
 'tabl',
 'having']

In [361]:
import json

teststr = '[{"date": "2019-10-10", "statement":"statement1","questions": [{"n": "1"}, {"n": "2"}] } , {"date": "2019-10-11", "statement":"statement2","questions": [{"n": "3"}, {"n": "5"}] } ]'

json_obj = json.loads(teststr)

In [112]:
import json
from pandas.io.json import json_normalize
import pandas as pd

In [188]:
df = json_normalize(json_obj)
df.drop( columns=['questions'], inplace=True)

In [189]:
df.set_index(df['date'], inplace=True)

In [190]:
df.drop(columns=['date'], inplace=True)

In [163]:
df = df.reset_index()

In [191]:
df.index

Index(['2019-10-10', '2019-10-11'], dtype='object', name='date')

In [203]:
statements_df = df.groupby(df.index).count()[['statement']] #returns dataframe object
statements_series = df.groupby('date').count().statement #returns series object

In [197]:
type(statements_df)

pandas.core.frame.DataFrame

In [136]:
df_with_questions = json_normalize(json_obj, meta=['date'], record_path='questions')
df_with_questions.set_index(df_with_questions['date'])
df_with_questions

Unnamed: 0,n,date
0,1,2019-10-10
1,2,2019-10-10
2,3,2019-10-11
3,5,2019-10-11


In [128]:
results_df = pd.merge(df, df_with_questions, on='date', how='outer')

In [129]:
results_df

Unnamed: 0,date,statement,n
0,2019-10-10,statement1,1
1,2019-10-10,statement1,2
2,2019-10-11,statement2,3
3,2019-10-11,statement2,5


## Setting up operation for inner json

In [482]:
from pandas.io.json import json_normalize
import pandas as pd

Unnamed: 0,date,questions,statement,url
0,2019-10-10,"[{'speaker2': 'This is something to talk'}, {'...",statement1,http://home.com/1
1,2019-10-11,"[{'speaker2': 'And what if we'}, {'speaker1': ...",statement2,http://home.com/2


In [565]:
df

Unnamed: 0,date,questions,statement,url
0,2019-10-10,[{'speaker1': 'Another point'}],statement1,http://home.com/1
1,2019-10-11,[{'speaker1': 'Listenering sdsd for is when'}],statement2,http://home.com/2


In [264]:
df_with_questions = json_normalize(json_obj, meta=['date'], record_path='questions')
df_with_questions.set_index(df_with_questions['date'], inplace=True)
df_with_questions.drop( columns=['date'], inplace=True)
df_with_questions

Unnamed: 0_level_0,q
date,Unnamed: 1_level_1
2019-10-10,dfd dfd
2019-10-10,fsdf dfusf
2019-10-11,fdd dsds the
2019-10-11,dfsj asd


In [265]:
# concatenate
df = pd.concat([df,df_with_questions], sort=False)
df

Unnamed: 0_level_0,statement,q
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-10-10,statement1,
2019-10-11,statement2,
2019-10-10,,dfd dfd
2019-10-10,,fsdf dfusf
2019-10-11,,fdd dsds the
2019-10-11,,dfsj asd


In [366]:
df

Unnamed: 0_level_0,statement,q
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-10-10,statement1,
2019-10-11,statement2,
2019-10-10,,dfd dfd
2019-10-10,,fsdf dfusf
2019-10-11,,fdd dsds the
2019-10-11,,dfsj asd


In [278]:
results_df.describe(include='all')

Unnamed: 0_level_0,statement,statement,statement,statement
Unnamed: 0_level_1,count,unique,top,freq
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2019-10-10,1,1,statement1,1
2019-10-11,1,1,statement2,1


In [237]:
results_df.describe()

Unnamed: 0_level_0,statement,statement,statement,statement,n,n,n,n
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2019-10-10,1,1,statement1,1,2,2,1,1
2019-10-11,1,1,statement2,1,2,2,3,1


In [289]:
results_df =df.groupby('date')[['q','statement']] 

In [298]:
df.groupby('date')[['q']].get_group('2019-10-11')

Unnamed: 0_level_0,q
date,Unnamed: 1_level_1
2019-10-11,
2019-10-11,fdd dsds the
2019-10-11,dfsj asd


Unnamed: 0_level_0,statement
date,Unnamed: 1_level_1
2019-10-10,"[statement1, nan, nan]"
2019-10-11,"[statement2, nan, nan]"


In [273]:
results_df =df.groupby('date')[['statement']]

In [334]:
results_df.get_group('2019-10-11')

Unnamed: 0_level_0,q,statement
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-10-11,,statement2
2019-10-11,fdd dsds the,
2019-10-11,dfsj asd,


In [445]:
import numpy as np
q_series = df.groupby('date')['q'].apply(lambda v: v.tolist() )
statement_series = df.groupby('date')['statement'].apply(lambda v: v.tolist() )

q_df = pd.DataFrame(q_series) # creating a dataframe from grouped series to merge
statement_df = pd.DataFrame(statement_series)

KeyError: 'Column not found: q'

In [336]:
statement_df

Unnamed: 0_level_0,statement
date,Unnamed: 1_level_1
2019-10-10,"[statement1, nan, nan]"
2019-10-11,"[statement2, nan, nan]"


In [337]:
q_df

Unnamed: 0_level_0,q
date,Unnamed: 1_level_1
2019-10-10,"[nan, dfd dfd, fsdf dfusf]"
2019-10-11,"[nan, fdd dsds the, dfsj asd]"


In [356]:
merged_df = pd.merge(statement_df, q_df, how='left', on='date')

In [360]:
merged_df

Unnamed: 0_level_0,statement,q
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-10-10,[statement1],"[dfd dfd, fsdf dfusf]"
2019-10-11,[statement2],"[fdd dsds the, dfsj asd]"


In [359]:

merged_df['statement'] = merged_df['statement'].apply(lambda x: [i for i in x if str(i) != ("nan" or "null")])
merged_df['q'] = merged_df['q'].apply(lambda x: [i for i in x if str(i) != ("nan" or "null")])


### Iteration with dictionaries

In [789]:
import json

teststr = '[{"date": "2019-10-10", "statement":"statement1","questions": [{"speaker1": "This is something to talk about fiscal policies"}, {"speaker1": "Another point about fiscal policies"}], "url":"http://home.com/1" } , {"date": "2019-10-11", "statement":"statement2","questions": [{"speaker2": "And what if we"}, {"speaker1": "Listenering sdsd for is when"}], "url":"http://home.com/2" } ]'

json_obj = json.loads(teststr)

In [790]:
speaker_names = ['speaker1']
df = json_normalize(json_obj)
# df.set_index(df['date'], inplace=True)
# df.drop(columns=['questions', 'date'], inplace=True)
df

Unnamed: 0,date,questions,statement,url
0,2019-10-10,[{'speaker1': 'This is something to talk about...,statement1,http://home.com/1
1,2019-10-11,"[{'speaker2': 'And what if we'}, {'speaker1': ...",statement2,http://home.com/2


In [791]:
speaker_names = ['speaker1']

def delete_unmatched(d:dict, l = ['speaker1']):
    values = []
    for num, i in enumerate(d):
        for k, v in i.items():
            if k not in speaker_names:
                del d[num]
    return d


def flatten_matched(list_of_dict:list, l = ['speaker1']):
    assert isinstance(list_of_dict, list)
    values = []
    for i in list_of_dict:
        if not isinstance(i, (dict)):
            return list_of_dict
        
        for k, v in i.items():
            if k in speaker_names:
                values.append(v)
    return ' '.join(s for s in values)


# [delete_unmatched(d, speaker_names) for d in df['questions']  ]

# df['questions'] = df['questions'].apply(delete_unmatched)
df['questions'] = df['questions'].apply(flatten_matched)

In [792]:
df

Unnamed: 0,date,questions,statement,url
0,2019-10-10,This is something to talk about fiscal policie...,statement1,http://home.com/1
1,2019-10-11,Listenering sdsd for is when,statement2,http://home.com/2


## Pre-processing

In [793]:
from datetime import datetime as dt

df['date'] = df['date'].apply(lambda x: dt.strptime(x, '%Y-%m-%d'))

In [794]:

df['questions_tokens'] = df['questions'].apply(lambda x: word_tokenize(x.lower()))


In [796]:
df['questions_tokens'] = df['questions_tokens'].apply(lambda x: [w for w in x if w not in stopwords.words('english')])


In [756]:
from nltk.stem.snowball import SnowballStemmer

snowball_stemmer = SnowballStemmer("english",ignore_stopwords=True)


df['token_stemmed'] = df['questions_tokens'].apply(lambda row: [snowball_stemmer.stem(s) for s in row] )


In [757]:
df['bigrams_stemmed'] = df['questions_tokens'].apply(lambda row: list(ngrams(row,2)) )


In [797]:
from nltk.stem import WordNetLemmatizer 

lemmatizer = WordNetLemmatizer() 

df['token_lemmatized'] = df['questions_tokens'].apply(lambda row: [lemmatizer.lemmatize(s) for s in row] )

In [905]:
import re
lemmatizer = WordNetLemmatizer() 

def generate_ngrams(s, n):
    # Convert to lowercases
    s = s.lower()
    
    # Replace all none alphanumeric characters with spaces
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    
    # Break sentence in the token, remove empty tokens
#     tokens = [token for token in s.split(" ") if token != ""]
    tokens = [w for w in s.split(" ") if w not in stopwords.words('english')]
    lemmatized_tokens = [lemmatizer.lemmatize(s) for s in tokens]
    
    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[lemmatized_tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]


None


In [906]:
df['unigrams'] = df['questions'].apply(lambda row: generate_ngrams(row, 1) )
df['bigrams'] = df['questions'].apply(lambda row: generate_ngrams(row, 2) )



In [907]:
df

Unnamed: 0,date,questions,statement,url,questions_tokens,token_lemmatized,bigrams_stemmed,bigrams_lemmatized,unigrams,bigrams
0,2019-10-10,This is something to talk about fiscal policie...,statement1,http://home.com/1,"[something, talk, fiscal, policies, another, p...","[something, talk, fiscal, policy, another, poi...","[(something, talk), (talk, fiscal), (fiscal, p...","[something talk, talk fiscal, fiscal policy, p...","[something, talk, fiscal, policy, another, poi...","[something talk, talk fiscal, fiscal policy, p..."
1,2019-10-11,Listenering sdsd for is when,statement2,http://home.com/2,"[listenering, sdsd]","[listenering, sdsd]","[(listenering, sdsd)]",[listenering sdsd],"[listenering, sdsd]",[listenering sdsd]


In [940]:
# for (columnName, columnData) in df.iteritems():
#    print('Colunm Name : ', columnName)
#    print('Column Contents : ', columnData.values)

[ columnData.values for (columnName, columnData) in df.iteritems() if columnName in ('bigrams', 'unigrams')]



[array([list(['something', 'talk', 'fiscal', 'policy', 'another', 'point', 'fiscal', 'policy']),
        list(['listenering', 'sdsd'])], dtype=object),
 array([list(['something talk', 'talk fiscal', 'fiscal policy', 'policy another', 'another point', 'point fiscal', 'fiscal policy']),
        list(['listenering sdsd'])], dtype=object)]

In [954]:
from collections import Counter

tokens_counter = [Counter(generate_ngrams(columnData.values[0], 1))\
                  for (columnName, columnData) in df.iteritems() \
                  if columnName in ('statement', 'questions')] 

bow_counter = [Counter(columnData.values[0]) \
               for (columnName, columnData) in df.iteritems() \
               if columnName in ('bigrams', 'unigrams')]

bow_tokens = sum(tokens_counter, Counter())
bow = sum(bow_counter, Counter())

bow_tokens

Counter({'something': 1,
         'talk': 1,
         'fiscal': 2,
         'policy': 2,
         'another': 1,
         'point': 1,
         'statement1': 1})

In [955]:
total = sum(bow_tokens.values())
print('Total number of words: %d' % total)

Total number of words: 9


In [958]:
def get_bow(df):
    for i in df.index:
        val = df.get_value(i,'bigrams')
        for 
        print(val)

get_bow(df)
        

['something talk', 'talk fiscal', 'fiscal policy', 'policy another', 'another point', 'point fiscal', 'fiscal policy']
['listenering sdsd']


  This is separate from the ipykernel package so we can avoid doing imports until


In [941]:
for (name, count) in sorted(bow.items()):
    print(f'{name}: {count}')

another: 1
another point: 1
fiscal: 2
fiscal policy: 2
point: 1
point fiscal: 1
policy: 2
policy another: 1
something: 1
something talk: 1
talk: 1
talk fiscal: 1


In [959]:
keyword = 'another'

lemmatizer = WordNetLemmatizer() 
keyword = [ lemmatizer.lemmatize(s) for s in keyword.split() ]
keyword = " ".join(keyword)

keyword

'another'

In [960]:

bow[keyword]

1

In [787]:
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter

text = "I need to write a program in NLTK that breaks a corpus (a large collection of \
txt files) into unigrams, bigrams, trigrams, fourgrams and fivegrams. I need to write a program in NLTK that breaks a corpus"
token = nltk.word_tokenize(text)

bigrams = ngrams(token,2)
trigrams = ngrams(token,3)
fourgrams = ngrams(token,4)
fivegrams = ngrams(token,5)

frequencies = Counter([])
frequencies += Counter(bigrams)

In [788]:
print(frequencies)

Counter({('I', 'need'): 2, ('need', 'to'): 2, ('to', 'write'): 2, ('write', 'a'): 2, ('a', 'program'): 2, ('program', 'in'): 2, ('in', 'NLTK'): 2, ('NLTK', 'that'): 2, ('that', 'breaks'): 2, ('breaks', 'a'): 2, ('a', 'corpus'): 2, ('corpus', '('): 1, ('(', 'a'): 1, ('a', 'large'): 1, ('large', 'collection'): 1, ('collection', 'of'): 1, ('of', 'txt'): 1, ('txt', 'files'): 1, ('files', ')'): 1, (')', 'into'): 1, ('into', 'unigrams'): 1, ('unigrams', ','): 1, (',', 'bigrams'): 1, ('bigrams', ','): 1, (',', 'trigrams'): 1, ('trigrams', ','): 1, (',', 'fourgrams'): 1, ('fourgrams', 'and'): 1, ('and', 'fivegrams'): 1, ('fivegrams', '.'): 1, ('.', 'I'): 1})


In [733]:
print(Counter(bigrams))

Counter({('I', 'need'): 2, ('need', 'to'): 2, ('to', 'write'): 2, ('write', 'a'): 2, ('a', 'program'): 2, ('program', 'in'): 2, ('in', 'NLTK'): 2, ('NLTK', 'that'): 2, ('that', 'breaks'): 2, ('breaks', 'a'): 2, ('a', 'corpus'): 2, ('corpus', '('): 1, ('(', 'a'): 1, ('a', 'large'): 1, ('large', 'collection'): 1, ('collection', 'of'): 1, ('of', 'txt'): 1, ('txt', 'files'): 1, ('files', ')'): 1, (')', 'into'): 1, ('into', 'unigrams'): 1, ('unigrams', ','): 1, (',', 'bigrams'): 1, ('bigrams', ','): 1, (',', 'trigrams'): 1, ('trigrams', ','): 1, (',', 'fourgrams'): 1, ('fourgrams', 'and'): 1, ('and', 'fivegrams'): 1, ('fivegrams', '.'): 1, ('.', 'I'): 1})


In [895]:

# http://www.albertauyeung.com/post/generating-ngrams-python/
import re

def generate_ngrams(s, n):
    # Convert to lowercases
    s = s.lower()
    
    # Replace all none alphanumeric characters with spaces
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    
    # Break sentence in the token, remove empty tokens
    tokens = [token for token in s.split(" ") if token != ""]
    
    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[tokens[i:] for i in range(n)])
    
    return [" ".join(ngram) for ngram in ngrams]

def ngrams_range(text,end):
    v = []
    for i in range(end):
        s = generated_bigrams(text, i )
        v.append(s)
    return v

# generated_unigrams = generate_ngrams(text, 1)
generated_bigrams = generate_ngrams(text, 2)

print(generated_unigrams)


['i', 'need', 'to', 'write', 'a', 'program', 'in', 'nltk', 'that', 'breaks', 'a', 'corpus', 'a', 'large', 'collection', 'of', 'txt', 'files', 'into', 'unigrams', 'bigrams', 'trigrams', 'fourgrams', 'and', 'fivegrams', 'i', 'need', 'to', 'write', 'a', 'program', 'in', 'nltk', 'that', 'breaks', 'a', 'corpus']


In [872]:
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

def get_ngrams(text, n ):
    n_grams = []
    for i in range(n):
        n_grams.append(ngrams(word_tokenize(text), n))
        
    return [ ' '.join(grams) for grams in n_grams]


    

IndentationError: expected an indented block (<ipython-input-872-8d5383705463>, line 14)