## Obtain Sentence Length in terms of words

For each chat message, calculate the number of words, disregarding any group of characters made up only of punctuation marks, e.g. text emoticons such as `:)` and lone question marks.

In [1]:
import gzip
import string

from os.path import join

In [8]:
# Characters of string.punctuation held in a set for fast membership tests.
PUNCTUATION_SET = set(string.punctuation)
# Gzip-compressed chat log used as input (path is relative to the working directory).
CHAT_LOG_FILE = join("..", "data_files", "gnue_irc_chat_logs_preprocessed.txt.gz")
# Directory that receives the generated feature file.
FEATURE_OUTPUT_DIR = join("..", "feature_outputs")

In [3]:
def strip_leading_and_trailing_punctuation(word):
    """Return ``word`` with punctuation removed from both of its ends.

    Only the outer characters are trimmed; punctuation that sits between
    other characters (for example the comma in ``is,about``) is kept.
    """
    punctuation_chars = string.punctuation
    without_leading = word.lstrip(punctuation_chars)
    return without_leading.rstrip(punctuation_chars)

In [4]:
def all_chars_in_word_are_punctuation(word):
    """Report whether ``word`` consists solely of punctuation characters.

    An empty ``word`` yields True, since it holds no character that could
    fail the check.
    """
    for char in word:
        if char not in PUNCTUATION_SET:
            # One non-punctuation character is enough to decide.
            return False
    return True

In [5]:
def get_sentence_length(sentence):
    """Count the words in one chat message, ignoring punctuation-only tokens.

    The message is split on whitespace. Tokens made up entirely of
    punctuation (e.g. ``:)`` or ``-``) are skipped. Every other token has its
    leading and trailing punctuation stripped and is then split on commas,
    so ``is,about`` counts as two words.

    Args:
        sentence: The message, either as UTF-8 encoded ``bytes``/``bytearray``
            or as an already decoded ``str``.

    Returns:
        The number of words found, as an ``int``.

    Raises:
        UnicodeDecodeError: If a bytes input is not valid UTF-8.
    """
    # Lines read from a binary file arrive as bytes; text input is used as is.
    if isinstance(sentence, (bytes, bytearray)):
        sentence = sentence.decode('utf-8')

    word_count = 0
    for token in sentence.split():
        if all_chars_in_word_are_punctuation(token):
            continue
        stripped = strip_leading_and_trailing_punctuation(token)
        # Splitting a comma-free token yields the token itself, so the same
        # loop covers both the plain and the comma-joined case.
        for part in stripped.split(','):
            # Empty or punctuation-only pieces (e.g. from "a,,b") are not words.
            if not all_chars_in_word_are_punctuation(part):
                word_count += 1
    return word_count

In [6]:
# Sanity check: "-", ":)" and the parentheses add no words, while "is,about"
# counts as two, so this should evaluate to 10.
get_sentence_length(b'though this is,about gnu enterprise - :) (a project within gnu)')

10

In [9]:
def generate_sentence_word_counts(input_file, output_directory, output_file, max_lines=None):
    """Write the word count of each line of a gzipped chat log to a text file.

    Each line of ``input_file`` is passed to ``get_sentence_length`` and the
    resulting count is written on its own line, in input order.

    Args:
        input_file: Path of the gzip-compressed chat log to read.
        output_directory: Directory in which the output file is written.
        output_file: Name of the output file inside ``output_directory``.
            An existing file with that name is overwritten.
        max_lines: Optional cap on the number of input lines processed,
            handy for a quick trial run. ``None`` (the default) processes
            the whole file; zero or a negative value writes no counts.
    """
    output_path = join(output_directory, output_file)
    # Binary mode yields bytes lines; get_sentence_length does the decoding.
    with gzip.open(input_file, 'rb') as chat_file, open(output_path, 'w') as out_file:
        for line_number, chat_line in enumerate(chat_file):
            if max_lines is not None and line_number >= max_lines:
                break
            out_file.write("{}\n".format(get_sentence_length(chat_line)))

In [10]:
# CAREFUL: running this cell overwrites any existing "sentence_word_counts.txt"
# in FEATURE_OUTPUT_DIR, because the output file is opened in write ('w') mode.
generate_sentence_word_counts(CHAT_LOG_FILE, FEATURE_OUTPUT_DIR, "sentence_word_counts.txt")