## Temporal Event Tracking of events in news
 - Source: https://towardsdatascience.com/natural-language-processing-event-extraction-f20d634661d3

In [162]:
import pandas as pd
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet 
import contractions
contractions.add("Here's", 'Here is')
contractions.add("Inc.", "Inc")

import nlpre
from nlpre import titlecaps, dedash, identify_parenthetical_phrases
from nlpre import replace_acronyms, replace_from_dictionary,url_replacement

stopwords = stopwords.words('english')
newStopWords = ['.','?','%','google','Wells Fargo','let','us','got','year','thing','would','make','time','Donald Trump','Charles Schwab','Morgan Stanley','Credit Suisse','Reuters','Bank of America','Guggenheim','Deutsch Bank','Goldman Sachs','Facebook','Fifth Third Bank','New York','Washington','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday','January','February','March','April','May','June','July','August','September','October','November','December','from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come','the']
stopwords.extend(newStopWords)
stemmer = nltk.stem.snowball.SnowballStemmer('english')
stop_list=set(stopwords)
url_replacer = url_replacement()

pd.set_option('max_colwidth',-1)

#replace newline and multi-spaces 
def clean_text_str(text):
    text = text.replace("\n"," ").replace("\t"," ").replace("\r"," ")
    text = re.sub(r" +"," ",text)
    text = text.strip()
    return text 

#clean unicode text 
def clean_unicode_text(text):
    str_test = clean_text_str(str(text))
    new_str =unicodedata.normalize("NFKD",str_test)
    return new_str 

#remove all URL references
def remove_URL(sample):
    """Remove URLs from a sample string"""
    #looks for any urls with spaces between domain.s\{1+}com
    fixspaces = re.sub(r"(?<=\.)\s+(?=com)",r'',sample)
    
    #remove https, remove wwws, and then loook for suffix's tied to websites
    return  re.sub(r"[^\s]*\.(com|org|net|gov|edu)\S*", r'', re.sub(r"www//.S+", "",re.sub(r"http\S+", "", sample)))

#remove email references
def remove_emails(text):
    return re.sub(r"\S*@\S*\s?",r'',text)

#extract phrases of text found in parenthesis
def extract_parentheses_text(text):
    return re.findall(r'\([^)]*\)',text, re.MULTILINE)

#remove the text found in parentheses
def remove_parentheses_txt(text,parentheses_txt_list):
    for val in parentheses_txt_list:
        text = text.replace(val,'')
    return text 

#cap case all full UPPER case abreviations and phrases in a pandas series that are not contained in the list of stopwords 
def capcase_abrevs(p_series,stop_list):
    out = p_series.apply(lambda x:  ' '.join([word.title()  if (word.isupper() and word.lower() not in stop_list) else 
                                               word.lower() if word.lower() in stop_list else 
                                               word for word in str(x).split() ]))
    return out 

#replace yall with you all , here's with here is , etc...
def replace_contractions(text):
    return contractions.fix(text)

#remove multiple full stops  and add space after end of sentences
def remove_multiple_periods(s):
    instr= re.sub(r'\.+', ".", s) #replace multiple periods 
    return  re.sub(r"\.+(?! )", ". ",instr) #insert space between sentences and periods 

#remove repeated words within a sentence
def remove_repeatwords(sentence):
    re_output = re.sub(r'\b(\w+)( \1\b)+', r'\1',sentence)
    return re_output

#func to compare original with processed string 
def get_string_diff(str1,str2):
    import difflib
    for line in difflib.context_diff(str1,str2):
        print(line)

def generate_stopphrases(stop_word_list):
    stop_phrases = []
    for item in stop_word_list:
        if len(item.split())>1:
            stop_phrases.append(' '.join([word.lower() for word in item.split()]))
            return stop_phrases
        
def replace_stopphrases(doc,stop_phrase_lst,replace_val = ''):
    for item in stop_phrase_lst:
        redata = re.compile(re.escape(item),re.IGNORECASE)
        doc = redata.sub(replace_val,doc)
    return doc 

import gensim 
def incorp_phrases(sent_word_tokenized_docs, min_count = 5, threshold=.9):
    word_tokenized_docs = [[item for sublist in doc for item in sublist] for doc in sent_word_tokenized_docs]
    phrases = gensim.models.phrases.Phrases(word_tokenized_docs, min_count=min_count, threshold = threshold, scoring = 'npmi')
    bigrams = gensim.models.phrases.Phraser(phrases)
    out_biphrased = bigrams[word_tokenized_docs]
    phrases2 = gensim.models.phrases.Phrases(out_biphrased, min_count=4, threshold = threshold, scoring = 'npmi')
    trigrams = gensim.models.phrases.Phraser(phrases2)
    out_trigrams = [[trigrams[word_toks] for word_toks in sent] for sent in sent_word_tokenized_docs]
    phrase_dict  = {phrase:score for phrase,score in phrases2.export_phrases(word_tokenized_docs)}
    return out_trigrams, phrase_dict

### Sentence Level Preprocssing Funcs

In [164]:
#1.sentence tokenize data 
#spacy.load('en_core_web_sm')
#python -m nltk.downloader all
#nltk.download() #C:\nltk_data

import spacy
import nltk 
import string 
import itertools
from collections import defaultdict 

nlp = spacy.load('en_core_web_sm', disable=['parser'])
word_tokenizer = nltk.tokenize.word_tokenize
sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')

def tokenize_sent(doc_str):
    return sent_tokenizer.tokenize(doc_str)

def tokenize_words(sent):
    return word_tokenizer(sent)

def lemma_sent(nlp, sentence,allowed_postags=None):
    doc = nlp(sentence)
    if allowed_postags:
        return [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    else: 
        return [token.lemma_ for token in doc]       

def pos_sent(nlp,sentence):
    doc = nlp(sentence)
    return [(token.text,token.pos_) for token in doc]

def ner_sent(nlp,sentence):
    doc = nlp(sentence)
    return [(token.text,token.label_) for token in doc.ents]

def get_noun_chunks(nlp, sentence):
    doc = self.nlp(sentence)
    return [(chunk.text.chunk.root.text) for chunk in doc.noun_chunks]

def replace_stopwords(word_tokenized_sent, stop_words=None):
    return [word for word in word_tokenized_sent if str(word).lower() not in stop_words]


#map each original word to the most common stem 
def map_stems_to_orig(original_corpus, stemmer):
    counts = defaultdict(lambda: defaultdict(int))
    surface_forms={}
    for document in original_corpus:
        for token in document:
            stemmed = stemmer.stem(token)
            counts[stemmed][token]+=1
    
    for stemmed, originals in counts.items():
        surface_forms[stemmed] = max(originals, key=lambda i: originals[i])
    return surface_forms 

#remove words with len(word)<n
#remove all digits
#stem words, and create mapping of stemed words back to most representative natural word
def stem_dict_map(word_tokenized_sent_list,map_stem_dict, stemmer,min_word_len = 2):
    out = []
    for sent in word_tokenized_sent_list:
        words = []
        for word in sent:
            if len(word)>=min_word_len and not any(c.isdigit() for c in word):
                words.append(map_stem_dict[stemmer.stem(word)])
        out.append(words)
    return out 


In [168]:
df = pd.read_pickle("C:\\Users\\zjc10\\Desktop\\Projects\\data\\news\\webhose_news\\webhose_df.pickle").head(1000).reset_index(drop=True)
print(df.shape)

RangeIndex(start=0, stop=1000, step=1)

### Document level text preprocessing 
#### to do 
    - remove special chars 
    - remove occurances of blank lists (ex. Website: Twitter: Facebook: Pinterest: Goodreads:)
    - if sentence contains click here, flag it (dont remove, until we see what all is entailed with click here)
    - remove abreviations (must build list to replace)


In [169]:
#add title as first sentence 
df['new_text'] = df.apply(lambda row: row['title']+' .'+row['text'],axis=1)

#replace odd symbol used for apostrophe 
df['new_text'] = df['text'].apply(lambda x: x.replace('тАЩ',"'"))

#remove all non printable text (non ansci-ii)
df['new_text'] = df['new_text'].apply(lambda x: re.sub(r'[^\x00-\x7f]',r'', x))

#replace email and urls 
df['new_text'] = df['new_text'].apply(lambda x: remove_URL(x))
df['new_text'] = df['new_text'].apply(lambda x: remove_emails(x))

#replace double spaces, replace newline and new tab and \r references, and strip outputand normalize to unicode 
df['new_text'] = df['new_text'].apply(lambda x: clean_unicode_text(x))   

#remove contractions 
df['new_text'] = df['new_text'].apply(lambda x: replace_contractions(x))

#extract parenthesesis text 
df['parentheses_txt'] = df['new_text'].apply(lambda x:extract_parentheses_text(x))
df['new_text'] = df.apply(lambda row: remove_parentheses_txt(row['new_text'],row['parentheses_txt']),axis=1)

#replace apostrophres and multiple periods
df['new_text'] = df['new_text'].apply(lambda x: remove_multiple_periods(x.replace("'","")))

#correct ALL UPPER case 
df['new_text'] = capcase_abrevs(df['new_text'],stop_list)

#get ner information 
# df['ner_tags']= df['new_text'].apply(
#     lambda txt: [ner_sent(nlp,sent)for sent in tokenize_sent(txt)])

# df['pos_tags']= df['new_text'].apply(
#     lambda txt: [pos_sent(nlp,sent)for sent in tokenize_sent(txt)])

#### Sentence Level Processing

In [170]:
#remove repeat words and subset sent length
df['txt2model'] = df['new_text'].apply(lambda txt:[   
        ''.join(remove_repeatwords(sent))
        for sent in tokenize_sent(txt) if len(sent)>20 ]
                                      )
#remove punct
df['txt2model']  = df['txt2model'].apply(
   lambda _list: [re.sub(f'[{string.punctuation}]',' ',x) for x in _list])

#remove stopwords and clean string up 
df['txt2model']  = df['txt2model'].apply(
   lambda _list: [clean_text_str(' '.join(replace_stopwords(tokenize_words(x),stop_words = stopwords))
                        )
                  for x in _list
                 ])

#lower it 
df['txt2model']  = df['txt2model'].apply(
    lambda _list: [tokenize_words(x.lower()) for x in _list
                  ])

#stem and map stems to dict , filter common words , generate phrases , create nmf string
map_stems = map_stems_to_orig([item for sublist in list(df['txt2model']) for item in sublist],stemmer)

#if there is any processing to do at the word level (incorporate it here)
df['txt2model_stem']  = df['txt2model'].apply(lambda sent_word_toks: stem_dict_map(sent_word_toks,map_stems,stemmer))

In [325]:
#incorp phrases into sentences 
df['txt2model_phrased'],phrase_dic =  incorp_phrases(df['txt2model_stem'], threshold=.7)

In [292]:
#GENERATE NMF INPUT DICT / BOW
#iterate over docs and populate gensim dict to be converted to bow  
gensim_dict = None
docs = [[item for sublist in doc for item in sublist] for doc in df['txt2model_phrased']]
gensim_dict = create_gensim_dict(docs)

#filter out tokens taht appear in less than no_below docs and appear no more than in no_above of all docs , keeping the top keep_nmost frequent tokens 
gensim_dict.filter_extremes(no_below=2, no_above=0.1, keep_n=None)
bow_corpus = [gensim_dict.doc2bow(doc,allow_update=False) for doc in docs]



[(0, 1),
 (1, 1),
 (2, 1),
 (3, 3),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 8),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 2),
 (13, 1),
 (14, 1),
 (15, 2),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 2),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 1),
 (47, 1),
 (48, 1),
 (49, 1),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 1),
 (57, 1),
 (58, 1),
 (59, 1),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 1),
 (64, 1),
 (65, 3),
 (66, 1),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 1),
 (71, 1)]

In [327]:
#explicitly filtering out common words from each bow based on tfidf weights
import numpy as np

CORPUS = bow_corpus
low_value = .05
tfidf = gensim.models.TfidfModel(CORPUS, id2word = gensim_dict)


for i in range(0,len(CORPUS)):
    bow = bow_corpus[i]
    
    low_value_words = [] #reinitialize to be safe. You can skip this.
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    new_bow = [b for b in bow if b[0] not in low_value_words]
    
    #reassign 
    CORPUS[i] = new_bow 
    
#length of each new corpus with stop words removed 
Original = np.array(map(len,bow_corpus)) 
NoStopWrds = np.array(map(len,CORPUS))

#difference in original string vs scrubbed string
diff = Original-NoStopWrds
print("No Stop Words Found in:", len(diff[diff==0]),"Strings")
print("Stop Words removed from:",len(diff[diff>0]),"Strings")


TypeError: unsupported operand type(s) for -: 'map' and 'map'

In [207]:
#gensim dict operators 
def create_gensim_dict(word_tokenized_sents_list):
    return gensim.corpora.Dictionary(word_tokenized_sents_list)
    
def update_gensim_dict(dict2Update, word_tokenized_sents_list):
    return dict2Update.add_documents(word_tokenized_sents_list)


#convert each doc to ids_ after all words from all docs have been accounted for in gensim dict 

#gensim_dict: gensim dict capturing all words / freqs across all documents 
def dict2doc2bow_corpus(gensim_dict,word_tokenized_doc_list):
    corpus_out = [gensim_dict.doc2bow(doc,allow_update=True) for doc in word_tokenized_doc_list]
    print([[(gensim_dict[id], count) for id,count in line] for line in word_tokenized_doc_list])
    return corpus_out

#get indiv doc word freqs from dict and bowcorpp 
#ex. first_doc_word_freqs = get_doc_word_freq(gensim_dict, bow_corpus[0])
def get_doc_word_freq(gensim_dict, bow_corpus):
    return  [(gensim_dict[id], count) for id, count in bow_corpus]



#save gensim dict and corpus to disk and load 
#note: filepath must have .dict extension 
def save_gensim_dict(gensim_dict, filepath):
    gensim_dict.save(filepath)
    print('gensim dict saved at {}'.format(filepath))
    
#save corpus to dict 
#note filepath must end in .mm
def save_gensim_corp(gesim_bow_corp, filepath):
    gensim_corp.MmCorpus.serialize(filepath,gesim_bow_corp)
          
def load_gensim_dict(filepath):
    return corpora.Dictionary.load(filepath)

def load_gensim_corpus(filepath):
    return corpora.MmCorpus(filepath)
    
    
#create dictionary to filter out common terms 
#dictionary = gensim.corpora.Dictionary(df['txt2model_phrased'].astype(str))
#df['txt2model_phrased'][0]

In [326]:
def clean_docs(df=None, textcol_name = None):
    if not df:
        raise exception('error must provide dataframe')
        
    if not textcol_name: 
        #add title as first sentence 
        df['new_text'] = df.apply(lambda row: row['title']+' .'+row['text'],axis=1)
    else:
        df['new_text'] = df.apply(lambda row: row['textcol_name'],axis=1)

    #replace odd symbol used for apostrophe 
    df['new_text'] = df['text'].apply(lambda x: x.replace('тАЩ',"'"))

    #remove all non printable text (non ansci-ii)
    df['new_text'] = df['new_text'].apply(lambda x: re.sub(r'[^\x00-\x7f]',r'', x))

    #replace email and urls 
    df['new_text'] = df['new_text'].apply(lambda x: remove_URL(x))
    df['new_text'] = df['new_text'].apply(lambda x: remove_emails(x))

    #replace double spaces, replace newline and new tab and \r references, and strip outputand normalize to unicode 
    df['new_text'] = df['new_text'].apply(lambda x: clean_unicode_text(x))   

    #remove contractions 
    df['new_text'] = df['new_text'].apply(lambda x: replace_contractions(x))

    #extract parenthesesis text 
    df['parentheses_txt'] = df['new_text'].apply(lambda x:extract_parentheses_text(x))
    df['new_text'] = df.apply(lambda row: remove_parentheses_txt(row['new_text'],row['parentheses_txt']),axis=1)

    #replace apostrophres and multiple periods
    df['new_text'] = df['new_text'].apply(lambda x: remove_multiple_periods(x.replace("'","")))

    #correct ALL UPPER case 
    df['new_text'] = capcase_abrevs(df['new_text'],stop_list)

    #get ner information 
    # df['ner_tags']= df['new_text'].apply(
    #     lambda txt: [ner_sent(nlp,sent)for sent in tokenize_sent(txt)])

    # df['pos_tags']= df['new_text'].apply(
    #     lambda txt: [pos_sent(nlp,sent)for sent in tokenize_sent(txt)])

    return df 

def process_sentences(df)

    #remove repeat words and subset sent length
    df['txt2model'] = df['new_text'].apply(lambda txt:[   
            ''.join(remove_repeatwords(sent))
            for sent in tokenize_sent(txt) if len(sent)>20 ]
                                          )
    #remove punct
    df['txt2model']  = df['txt2model'].apply(
       lambda _list: [re.sub(f'[{string.punctuation}]',' ',x) for x in _list])

    #remove stopwords and clean string up 
    df['txt2model']  = df['txt2model'].apply(
       lambda _list: [clean_text_str(' '.join(replace_stopwords(tokenize_words(x),stop_words = stopwords))
                            )
                      for x in _list
                     ])

    #lower it 
    df['txt2model']  = df['txt2model'].apply(
        lambda _list: [tokenize_words(x.lower()) for x in _list
                      ])

    #stem and map stems to dict , filter common words , generate phrases , create nmf string
    map_stems = map_stems_to_orig([item for sublist in list(df['txt2model']) for item in sublist],stemmer)

    #if there is any processing to do at the word level (incorporate it here)
    df['txt2model_stem']  = df['txt2model'].apply(lambda sent_word_toks: stem_dict_map(sent_word_toks,map_stems,stemmer))
    
    return df 


In [185]:
gensim_dict = gensim.corpora.Dictionary()
len(gensim_dict)

0

In [192]:
a =gensim.corpora.Dictionary([item for sublist in df['txt2model_phrased'].head(1) for item in sublist])

In [215]:
for val in gensim_dict.items():
    print(val)

(0, 'changer')
(1, 'game')
(2, 'jami_davenport')
(3, 'live')
(4, 'anything')
(5, 'discovery')
(6, 'else')
(7, 'past')
(8, 'pulled')
(9, 'read')
(10, 'really')
(11, 'refreshing')
(12, 'story')
(13, 'unlike')
(14, 'heartstrings')
(15, 'high')
(16, 'leave')
(17, 'one')
(18, 'recommend')
(19, 'smile')
(20, 'sure')
(21, 'tug')
(22, 'buy')
(23, 'red')
(24, 'review')
(25, 'romance')
(26, 'amazon')
(27, 'battle')
(28, 'blurb')
(29, 'champion')
(30, 'collide')
(31, 'end')
(32, 'find')
(33, 'football')
(34, 'horse')
(35, 'horsewoman_stubborn')
(36, 'hunter')
(37, 'ibooks')
(38, 'kobo')
(39, 'mccoy')
(40, 'neophyte')
(41, 'nook')
(42, 'owners')
(43, 'race')
(44, 'seattle')
(45, 'sexy')
(46, 'star')
(47, 'steelheads')
(48, 'thoroughbred')
(49, 'tight')
(50, 'willing')
(51, 'beat')
(52, 'carrigans')
(53, 'derby')
(54, 'died')
(55, 'enough')
(56, 'enter')
(57, 'farm')
(58, 'feat')
(59, 'fulfillment')
(60, 'heart')
(61, 'kate')
(62, 'mother')
(63, 'order')
(64, 'save')
(65, 'vanderhof')
(66, 'wish')


In [269]:

##FILTER OUT LOW INFORMATION WORDS 
##NEED TO GET THIS WORKING
def filterCommonWords(BOWCoprus,dictionary, corpus, low_value_thresh = .02):
	from gensim.models import TfidfModel
	from gensim import models
	#filter out common words 
	#save copy of original corpus 
	CORPUS = list(BOWCoprus)

	#create td-idf model object using dictonary
	tfidf = models.TfidfModel(CORPUS, id2word = dictionary)

	#filter low value words
	low_value = low_value_thresh

	for i in range(0, len(CORPUS)):
	    bow = corpus[i]
	    low_value_words = [] #reinitialize to be safe. You can skip this.
	    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
	    new_bow = [b for b in bow if b[0] not in low_value_words]

	    #reassign        
	    CORPUS[i] = new_bow
	    
	#length of each new corpus with stop words removed 
	Original = np.array(map(len,corpus)) 
	NoStopWrds = np.array(map(len,CORPUS))

	#difference in original string vs scrubbed string
	diff = Original-NoStopWrds
	print("No Stop Words Found in:", len(diff[diff==0]),"Strings")
	print("Stop Words removed from:",len(diff[diff>0]),"Strings")

	return CORPUS   

In [249]:
docs = [[item for sublist in doc for item in sublist] for doc in df['txt2model_phrased']]

In [146]:

#preprocess docs with gensim 
stemlem_docs = df['txt2model_phrased'].astype(str).map(preprocess)
#stemlem_docs=preprocessed_docs


In [None]:
docs[0]

In [None]:
df['new_text'].tail(30)
#sample = [x for x in df['new_text']]

#findit = re.compile(r"(?<=\.)\s+(?=com)")
#findit.findall(s)

m = re.search(r"(?<=\.)\s+(?=com)",s,re.MULTILINE)
m = re.search(r'\([^)]*\)', s, re.MULTILINE)

m.span()
s[m.span()[0]-10:m.span()[1]+10]


if matches:
#   print(matches.groups())
   print(matches)
    
#    return  re.sub(r"[^\s]*\.(com|org|net|gov)\S*", r'', re.sub(r"www//.S+", "",re.sub(r"http\S+", "", fixspaces)))
#)
#s
#import dateparser
#from dateutil.parser import parse
 
#parse(s[:300], fuzzy_with_tokens=True)
#type(s)
#import phonenumbers 

# text = "Call me at 5107488230 if it's before 9:30, or on 703-4800500 after 10am."
# for match in phonenumbers.PhoneNumberMatcher(str(sample[136])):
#     print(match)
df['new_text'].tail(12)

#Is this what you desire?

#import re


#re.sub(r"\.+(?! )", ". ", 'my name is zack..i dont lik you.ur not cool')

In [329]:
df['txt2model_stem'].head(3
                         )

0    [[world, south, america, brazil, soundtrack], [okay, maybe, exact, brazillian, music, awesome, live, show, legendary], [brazil, cover, almost, half, south, america, amazon, rainforest, world, largest, jungle], [rapid, getting, cut], [country, basic, one, giant, botanical, garden, bangin, city, edge], [brazil, colon, royal, court, portugal, fleeing, napoleans, troops], [stay, long, brazil, independent, biggest, city, sao, paulo, financial, hub, south, america], [brazil, bric, brazil, russia, india, china], [four, label, world, fastest, development, large, economy], [brazil, known, three, things, amazing, beauty, women, carnival, pele, king, football, athletic, century, football, ambassador, world, declared, national, treasure], [brazillian, tell, foreign, miles, away, way, hips, move], [samba, built, soul, brazil, carnival, burst, twenty, four, hours, undying, explosive, sound, every, street], [picture, mask, dive, taipus, reefs, making, hot, salty, tears, calendar, set, january]] 