#### Setup

In [None]:
# Install NLTK into the notebook's environment (IPython shell escape).
!pip install nltk

In [None]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus so stopwords.words() below can read it.
nltk.download('stopwords')
# Stored as a set: remove_stop_words() tests membership once per word.
en_stops = set(stopwords.words('english'))

#### Import Dataset

In [None]:
# Load the article titles; split_words() below reads the 'title' column.
df = pd.read_csv('articles-titles.csv')

#### Split and Clean

In [None]:
def split_words(df, min_len):
    '''Split every title into lowercase, punctuation-free words.

    Each value in df['title'] is converted to text, split on whitespace,
    lowercased, and stripped of every non-word character and underscore.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'title' column. Missing titles (NaN / None) are
        skipped so they do not show up as the literal word "nan"/"none".
    min_len : int
        Minimum length (after cleaning) a word needs in order to be kept.

    Returns
    -------
    list of str
        Cleaned words in title order, duplicates preserved.
    '''
    # Raw string avoids the invalid-escape warning of '[\W\_]'; the pattern
    # is bound once here instead of being looked up for every token.
    strip_non_word = re.compile(r'[\W_]').sub

    words = []

    for title in df['title'].dropna():
        # str() keeps non-string titles (e.g. numeric ones) usable.
        for token in str(title).split():
            cleaned = strip_non_word('', token.lower())  # remove punctuation
            # Tokens that were pure punctuation collapse to '' and are
            # never words, whatever min_len is.
            if cleaned and len(cleaned) >= min_len:
                words.append(cleaned)

    return words

In [None]:
# Keep only cleaned words that are at least 4 characters long.
words = split_words(df, 4)

In [None]:
def remove_stop_words(words):
    '''Return a new list holding only the entries of `words` that are not
    in `en_stops` (the English stop-word set built from NLTK above).

    Order and duplicates of the surviving words are preserved.
    '''
    return [candidate for candidate in words if candidate not in en_stops]

In [None]:
# Drop English stop words from the cleaned word list.
words_no_stops = remove_stop_words(words)

#### Get top words and save as a dataset

In [None]:
def top_words(words=None, top=100):
    '''Count word frequencies and return the `top` most frequent words.

    Parameters
    ----------
    words : iterable of str, optional
        Words to count. When omitted, the notebook-level `words_no_stops`
        list is used, looked up at call time.
    top : int, default 100
        Number of most frequent words to keep.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'word' with a single 'count' column, sorted by count
        ascending (the most frequent word is the last row). Words with
        equal counts stay in alphabetical order.
    '''
    if words is None:
        # Resolve the fallback lazily rather than binding the global list
        # as a default argument at definition time.
        words = words_no_stops

    # Count the words that were passed in; the previous body always read
    # the global list and ignored this parameter.
    words_df = pd.DataFrame(list(words), columns=['word'])
    counts = words_df.groupby('word')['word'].count()
    # groupby yields alphabetical keys; a stable sort keeps that order
    # among ties so the result is deterministic.
    ranked = counts.sort_values(kind='mergesort').to_frame(name='count')

    return ranked.tail(top)

In [None]:
# Keep the 100 most frequent words (returned sorted by count, ascending).
top_n = top_words(words_no_stops, 100)

#### Save to db, or export to csv

In [None]:
# Write the top-word counts to export.csv in the working directory.
# NOTE(review): the original comment said this "writes output back to Domo";
# presumably the platform picks up export.csv — confirm, since nothing here
# talks to Domo directly.
top_n.to_csv('export.csv')