# Features CSV

This notebook concatenates all the generated feature files into a single CSV that can be used for training. The columns to be included are:

- Sentence Vector
- Absolute Sentence Position
- Sentence Length
- Number of Special Terms
- Sentiment Score
- Mean TF-IDF
- Normalized Mean TF-IDF
- Mean TF-ISF
- Normalized Mean TF-ISF

All the files provided should contain the same number of lines, i.e. data records. For the current GNUe chat dataset, that is `659165` records.

In [1]:
import csv
import gzip
import os

from contextlib import ExitStack
from os.path import join

In [2]:
def load_selected_chats(input_filename):
    """Load ``(log_id, is_summary)`` pairs from a two-column text file.

    Every non-blank line must have the form ``<log_id>,<is_summary>``, with
    both fields parseable as integers. Blank lines (for example an extra
    newline at the end of the file) are skipped instead of aborting the load.

    Args:
        input_filename: Path of the file to read.

    Returns:
        A list of ``(int, int)`` tuples, in file order.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            comma-separated integer fields.
    """
    selected_chats = []
    with open(input_filename) as input_file:
        for line in input_file:
            line = line.strip()
            if not line:
                # Nothing to parse; unpacking "" would raise ValueError.
                continue
            log_id, is_summary = line.split(",")
            selected_chats.append((int(log_id), int(is_summary)))
    return selected_chats

In [5]:
# Peek at the first five (log_id, is_summary) pairs parsed from summarized_chat_log_ids.csv.
load_selected_chats(join("..", "data_files", "summarized_chat_log_ids.csv"))[:5]

[(85350, 0), (85351, 0), (85352, 0), (85353, 0), (85354, 0)]

In [6]:
# Number of (log_id, is_summary) pairs listed in summarized_chat_log_ids.csv.
len(load_selected_chats(join("..", "data_files", "summarized_chat_log_ids.csv")))

20715

In [10]:
def create_features_csv(
    features_dir,
    features_csv_filename,
    absolute_sentence_positions_filename,
    sentence_length_filename,
    number_of_special_terms_filename,
    sentiment_score_filename,
    mean_tf_idf_filename,
    normalized_mean_tf_idf_filename,
    mean_tf_isf_filename,
    normalized_mean_tf_isf_filename,
    sentence_embeddings_filename=None,
    selected_chat_logs=None
):
    """Merge the line-aligned feature files into a single CSV file.

    Line ``n`` (1-based) of every feature file is treated as the features of
    record ``n``. One CSV row is written per record, preceded by a header row.
    Reading stops at the first empty line / end of the absolute sentence
    positions file. Only the first whitespace-separated token of each
    sentiment line is kept.

    Args:
        features_dir: Directory containing every input file; the CSV is
            written there as well.
        features_csv_filename: Name of the CSV file to create (overwritten).
        absolute_sentence_positions_filename: Feature file name.
        sentence_length_filename: Feature file name.
        number_of_special_terms_filename: Feature file name.
        sentiment_score_filename: Feature file name.
        mean_tf_idf_filename: Feature file name.
        normalized_mean_tf_idf_filename: Feature file name.
        mean_tf_isf_filename: Feature file name.
        normalized_mean_tf_isf_filename: Feature file name.
        sentence_embeddings_filename: Optional name of a gzip file inside
            ``features_dir``. The ``sentence_vector`` column is currently
            disabled, so the file is only opened (a wrong path still fails)
            and its content is not read.
        selected_chat_logs: Optional sequence of ``(log_id, is_summary)``
            pairs. When given, only records whose line number equals a
            ``log_id`` are written, each with its ``is_summary`` value. The
            files are read forward only, so the ids must be in ascending
            order. When omitted or empty, every record is written with its
            line number as ``log_id`` and ``"NULL"`` as ``is_summary``.

    Raises:
        AssertionError: If the feature files end before every entry of
            ``selected_chat_logs`` has been written.
    """
    # The order here is the order of the feature columns in the CSV.
    feature_filenames = [
        absolute_sentence_positions_filename,
        sentence_length_filename,
        number_of_special_terms_filename,
        sentiment_score_filename,
        mean_tf_idf_filename,
        normalized_mean_tf_idf_filename,
        mean_tf_isf_filename,
        normalized_mean_tf_isf_filename,
    ]
    sentiment_column = feature_filenames.index(sentiment_score_filename)

    current_log_id = None
    is_summary = "NULL"
    selected_chat_logs_index = 0
    if selected_chat_logs:
        current_log_id, is_summary = selected_chat_logs[selected_chat_logs_index]
        print("Current Top: ", current_log_id)

    with ExitStack() as stack:
        # newline="" lets the csv module control row terminators itself
        # (otherwise rows end in "\r\r\n" on Windows).
        features_csv_file = stack.enter_context(
            open(join(features_dir, features_csv_filename), "w", newline="")
        )
        feature_files = [
            stack.enter_context(open(join(features_dir, filename)))
            for filename in feature_filenames
        ]
        if sentence_embeddings_filename:
            # Opened only so that a missing embeddings file is still reported;
            # no placeholder file is created when no embeddings are given.
            stack.enter_context(
                gzip.open(join(features_dir, sentence_embeddings_filename))
            )

        def read_record():
            """Read the next line of every feature file as one list of fields."""
            fields = [feature_file.readline().strip() for feature_file in feature_files]
            sentiment_tokens = fields[sentiment_column].split()
            # An exhausted or blank sentiment line yields an empty field
            # instead of raising IndexError / writing the text "[]".
            fields[sentiment_column] = sentiment_tokens[0] if sentiment_tokens else ""
            return fields

        features_csv = csv.writer(features_csv_file, delimiter=',')
        features_csv.writerow([
            "log_id",
            # "sentence_vector",
            "absolute_sentence_position",
            "sentence_length",
            "number_of_special_terms",
            "sentiment_score",
            "mean_tf_idf",
            "normalized_mean_tf_idf",
            "mean_tf_isf",
            "normalized_mean_tf_isf",
            "is_summary"
        ])

        file_line_number = 1
        record = read_record()
        # record[0] is the absolute sentence position; empty means no more data.
        while record[0]:
            if selected_chat_logs and current_log_id != file_line_number:
                # Not a selected record: skip ahead to the next line.
                record = read_record()
                file_line_number += 1
                continue

            if not selected_chat_logs:
                current_log_id = file_line_number

            features_csv.writerow([current_log_id] + record + [is_summary])

            record = read_record()
            file_line_number += 1
            if selected_chat_logs:
                selected_chat_logs_index += 1
                if selected_chat_logs_index >= len(selected_chat_logs):
                    break
                current_log_id, is_summary = selected_chat_logs[selected_chat_logs_index]

    if selected_chat_logs:
        assert selected_chat_logs_index == len(selected_chat_logs), (
            "feature files ended before all selected chat logs were written"
        )
        print("Final index: ", selected_chat_logs_index)
    print("Last File Number: ", file_line_number)

In [11]:
# Directory holding the feature files and the generated CSVs.
FEATURES_DIR = join("..", "feature_outputs")

# Output CSV names.
features_csv_filename = "all_chat_features.csv"
summarized_chats_features_csv_filename = "summarized_chats_features.csv"

# Input feature files, all located inside FEATURES_DIR.
absolute_sentence_positions_filename = "absolute_sentence_positions.txt"
sentence_length_filename = "sentence_word_counts.txt"
number_of_special_terms_filename = "special_terms_count.txt"
sentiment_score_filename = "sentence_sentiments.txt"
mean_tf_idf_filename = "chats_mean_tf_idf.txt"
normalized_mean_tf_idf_filename = "normalized_chats_mean_tf_idf.txt"
mean_tf_isf_filename = "chats_mean_tf_isf.txt"
normalized_mean_tf_isf_filename = "normalized_chats_mean_tf_isf.txt"
sentence_embeddings_filename = "sentence_embeddings_2.csv.gz"

# (log_id, is_summary) pairs: the summarized subset and the complete list.
selected_chat_logs = load_selected_chats(join("..", "data_files", "summarized_chat_log_ids.csv"))
chat_logs = load_selected_chats(join("..", "data_files", "chat_log_ids.csv"))

# All chats, without sentence embeddings.
create_features_csv(
    features_dir=FEATURES_DIR,
    features_csv_filename=features_csv_filename,
    absolute_sentence_positions_filename=absolute_sentence_positions_filename,
    sentence_length_filename=sentence_length_filename,
    number_of_special_terms_filename=number_of_special_terms_filename,
    sentiment_score_filename=sentiment_score_filename,
    mean_tf_idf_filename=mean_tf_idf_filename,
    normalized_mean_tf_idf_filename=normalized_mean_tf_idf_filename,
    mean_tf_isf_filename=mean_tf_isf_filename,
    normalized_mean_tf_isf_filename=normalized_mean_tf_isf_filename,
    sentence_embeddings_filename=None,
    selected_chat_logs=chat_logs,
)

# Summary features
# create_features_csv(
#     FEATURES_DIR,
#     "summarized_chats_features_with.csv",
#     absolute_sentence_positions_filename,
#     sentence_length_filename,
#     number_of_special_terms_filename,
#     sentiment_score_filename,
#     mean_tf_idf_filename,
#     normalized_mean_tf_idf_filename,
#     mean_tf_isf_filename,
#     normalized_mean_tf_isf_filename,
#     None,
#     selected_chat_logs
# )

Current Top:  1
Final index:  659165
Last File Number:  659166


In [12]:
def count_lines_in_file(input_filename):
    """Return how many lines the text file at ``input_filename`` contains."""
    with open(input_filename) as input_file:
        # Iterating the file yields one item per line; tally them lazily.
        return sum(1 for _ in input_file)

In [16]:
# Read the first two lines of the gzip-compressed sentence vectors file
# (as stripped bytes) and display the second one.
sentence_vectors_path = join(FEATURES_DIR, "sentence_vectors.txt.gz")
with gzip.open(sentence_vectors_path) as features_e:
    emb, emb2 = [features_e.readline().strip() for _ in range(2)]
emb2

b'1.0222757701660766e-10,-1.9559458929091927e-09,1.3279695939731687e-09,1.2324553052248554e-09,2.482606608290032e-09,-2.74391371849788e-10,-1.0863636265334856e-09,-2.4886456205472817e-10,1.2621625688900456e-09,9.18070533198011e-10,2.4075938601442944e-09,-5.326582242199325e-10,7.117011443837444e-10,2.579528216134441e-09,-7.93907053032071e-10,-1.7025355585994092e-09,5.637443285719513e-10,4.4665024924786264e-10,-2.3499701611548076e-09,-1.4646918318630677e-09,-1.4871635485560617e-09,8.606370103764562e-11,3.736129755012441e-10,-2.514077149090871e-09,-9.642871501648177e-10,-1.0885982765332219e-09,9.025190258043666e-10,1.24999058770125e-09,-6.623168452549785e-10,1.1354552824116734e-09,3.353764061842507e-10,1.9264491487097168e-09,-2.778983400399219e-09,1.9516112582558669e-10,-5.482126810054205e-09,5.117940143867821e-10,1.3941000366499076e-10,5.096144667975348e-10,-2.226912349709594e-09,-4.0765575687121e-10,2.690162376727568e-10,-1.8730766360115173e-10,4.043583102986519e-10,-1.1536109085317949e

In [14]:
# Count the lines of summarized_chats_features_with_embeddings.csv.
# Note: sentence_embeddings_filename is rebound here to a full path inside FEATURES_DIR.
FEATURES_DIR = join("..", "feature_outputs")
sentence_embeddings_filename = join(FEATURES_DIR, "summarized_chats_features_with_embeddings.csv")
count_lines_in_file(sentence_embeddings_filename)

20716

In [11]:
# File metadata (size, timestamps, ...) for the path held in sentence_embeddings_filename.
stats = os.stat(sentence_embeddings_filename)
stats

os.stat_result(st_mode=33188, st_ino=19478805, st_dev=16777221, st_nlink=1, st_uid=501, st_gid=20, st_size=21151757222, st_atime=1550506379, st_mtime=1550493216, st_ctime=1550494480)