In [1]:
from tqdm import tqdm


import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import matplotlib.pyplot as plt
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

import spacy
import re

<h1>DATABASE<h1>

In [30]:
%run ../database.ipynb

NLTK download

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('brown')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wwwhh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wwwhh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wwwhh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\wwwhh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\wwwhh\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

Query database

In [9]:
filter_words = ['china', 'chinese', 'sumerian', 'sumerians', 'sumer', 'shang', 'xia', 'cuneiform']

stop_words = set(stopwords.words('english'))
gensim_stopwords = list(STOPWORDS)
sklearn_stopwords = list(ENGLISH_STOP_WORDS)
custom_stopwords = set(['going', 'im', 'one', 'go', 'people', 'say', 'get', 'know', 'think', 'like', 'want', 'make', 'thing', 'good', 'really', 'time', 'look', 'come', 'see', 'would', 'could', 'may', 'might', 'must', 'shall', 'should', 'will', 'ought', 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', '"', "'", '“', '”', '’', '‘', '—', '–', '...', '…', '•', '·', '÷', "'s", "\n", "\t", '’s'])
stop_words = stop_words.union(custom_stopwords)
stop_words = stop_words.union(gensim_stopwords)
stop_words = stop_words.union(sklearn_stopwords)
noun_phrase_stop_words = set(gensim_stopwords + sklearn_stopwords)

nlp = spacy.load("en_core_web_sm")


def deduplicate(list_of_dicts):
    deduplicated = []
    text_seen = set()
    for d in list_of_dicts:
        if "text" not in d.keys():
            continue
        if d['text'] not in text_seen:
            text_seen.add(d['text'])
            deduplicated.append(d)
            
    return deduplicated

def filter(tokens):
    return any(word in filter_words for word in tokens)

def filtered_noun_phrases(stop_words, noun_phrases):
    filtered_noun_phrases = []


    for noun_phrase in noun_phrases:
        word = noun_phrase
        for stop_word in stop_words:
            if stop_word in noun_phrase:
                word_to_remove = rf"\b{stop_word}\b"
                word = re.sub(word_to_remove, "", word).strip()
        
        count = len(word.split())
        if count <= 1:
            continue
        else:
            filtered_noun_phrases.append(word)
    return filtered_noun_phrases

def lemmatize(word):
    return nlp(word)[0].lemma_

def keep_english_only(text):
    english_pattern = r'\b[a-zA-Z]+\b'
    
    english_words = re.findall(english_pattern, text)
    cleaned_text = ' '.join(english_words)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
    return cleaned_text

def tokenize(text):
    
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+|\S+@\S+|[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = keep_english_only(text)
    
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    
    # if not filter(tokens):
    #     return ""

    doc = nlp(text)
    noun_phrases = [np.text for np in doc.noun_chunks]
    noun_phrases = filtered_noun_phrases(noun_phrase_stop_words, noun_phrases)
    tokens += noun_phrases
    
    result = []
    for token in tokens:
        if len(token.split()) <= 1: # single word
            result.append(lemmatize(token))
        else: # noun phrase
            temp = ""
            for word in token.split():
                temp += lemmatize(word) + " "
            result.append(temp.strip())
    
    return result

<h1>Add a new column for preprocessed data<h1>

In [5]:
def word_count(df):
    df['word_count'] = df['text'].apply(lambda x: len(x.split()))
    bins = [0, 100, 200, 500, 1000, 2000, 5000, 7500, 15000]
    labels = ['0-100', '101-200', '201-500', '501-1000', '1001-2000', '2001-5000', '5001-7500', '7501+']
    df['word_count_category'] = pd.cut(df['word_count'], bins=bins, labels=labels, right=False)
    return df

In [10]:
original_deduplication = deduplicate(get_all_content())
original_deduplication_without_filtering = deduplicate(get_all_content())

In [11]:
for i in tqdm(range(len(original_deduplication))):
    original_deduplication[i]['preprocessed_token'] = tokenize(original_deduplication[i]['text'])

100%|██████████| 11769/11769 [1:09:01<00:00,  2.84it/s]


In [8]:
# for i in tqdm(range(len(original_deduplication))):
#     original_deduplication_without_filtering[i]['preprocessed_text'] = tokenize(original_deduplication[i]['text'])

In [18]:
import pandas as pd

In [19]:
df_filtering = pd.DataFrame(original_deduplication)
df_filtering = word_count(df_filtering)
df_filtering = df_filtering[df_filtering['preprocessed_token'] != '']
df_filtering = df_filtering[df_filtering['preprocessed_token'] != None]

# df_without_filtering = pd.DataFrame(original_deduplication_without_filtering)
# df_without_filtering = word_count(df_without_filtering)
# df_without_filtering.rename(columns={'preprocessed_text': 'preprocessed_token'}, inplace=True)

In [24]:
df_filtering["preprocessed_text"] = df_filtering["preprocessed_token"].apply(lambda x: ' '.join(x))
# df_without_filtering["preprocessed_text"] = df_filtering["preprocessed_token"].apply(lambda x: ' '.join(x))

In [25]:
df_filtering['preprocessed_text'][0]

'story noahs flood primarily derive biblical narrative book genesis ancient culture flood myth absence direct account noahs flood record early civilization mention attribute factorsncultural context flood narrative bible specific hebrew tradition civilization mythology include flood story epic gilgamesh mesopotamia narrative reflect culture value historical experience society produce universal accountsngeographical difference civilization mention locate different region distinct environmental geological context flooding event significant catastrophic area warrant last narrative noahs flood example nile river predictable flooding essential egyptian agriculture view positively destructive forcenhistorical documentation recordkeepe practice varied significantly civilization sumerian egyptian advance write system record focus political economic religious matter mythological event absence record necessarily imply event occur simply reflect deem significant documentnmythological syncretism f

In [26]:
col = ['_id', 'text', 'search_keyword', 
        'url', 'source', 'preprocessed_text', 
        'preprocessed_token', 'word_count', 'word_count_category']

In [27]:
df_filtering = df_filtering[col]
# df_without_filtering = df_without_filtering[col]

In [33]:
for index, row in df_filtering.iterrows():
    insert_final_content_table(row.to_dict())

Inserted document with ID: 671bb15ac146c0252b26a9c8
Inserted document with ID: 671bb15bc146c0252b26a9ca
Inserted document with ID: 671bb15cc146c0252b26a9cc
Inserted document with ID: 671bb15dc146c0252b26a9ce
Inserted document with ID: 671bb15ec146c0252b26a9d0
Inserted document with ID: 671bb15fc146c0252b26a9d2
Inserted document with ID: 671bb160c146c0252b26a9d4
Inserted document with ID: 671bb162c146c0252b26a9d6
Inserted document with ID: 671bb163c146c0252b26a9d8
Inserted document with ID: 671bb164c146c0252b26a9da
Inserted document with ID: 671bb164c146c0252b26a9dc
Inserted document with ID: 671bb165c146c0252b26a9de
Inserted document with ID: 671bb166c146c0252b26a9e0
Inserted document with ID: 671bb167c146c0252b26a9e2
Inserted document with ID: 671bb168c146c0252b26a9e4
Inserted document with ID: 671bb169c146c0252b26a9e6
Inserted document with ID: 671bb16ac146c0252b26a9e8
Inserted document with ID: 671bb16bc146c0252b26a9ea
Inserted document with ID: 671bb16cc146c0252b26a9ec
Inserted doc