# Features CSV

This notebook concatenates all the generated feature files into a single CSV that can be used for training. The columns to be included are:

- Sentence Vector (currently disabled: the code that reads and writes it is commented out below)
- Absolute Sentence Position
- Sentence Length
- Number of Special Terms
- Sentiment Score
- Mean TF-IDF
- Normalized Mean TF-IDF
- Mean TF-ISF
- Normalized Mean TF-ISF

All the input files must contain the same number of lines, i.e. data records. For the current GNUe chat dataset, that is `659165` records.

In [10]:
import csv
import os

from os.path import join


In [3]:
def load_selected_chats(input_filename):
    """Load the chat-log ids listed one per line in a text file.

    Args:
        input_filename: Path to a file holding one integer id per line.
            Surrounding whitespace is ignored and blank lines are skipped.

    Returns:
        A list of ints, in the order they appear in the file.

    Raises:
        ValueError: If a non-blank line cannot be parsed as an integer.
    """
    with open(input_filename) as input_file:
        stripped_lines = (line.strip() for line in input_file)
        # Blank lines (typically a trailing empty line) carry no id; skipping
        # them keeps int("") from aborting the whole load with a ValueError.
        return [int(chat_id) for chat_id in stripped_lines if chat_id]

In [4]:
# Sanity check: how many chat-log ids the summarized-chats id file contains.
len(load_selected_chats(join("..", "data_files", "summarized_chat_log_ids.csv")))

20715

In [5]:
# Column order of the generated CSV. It matches the order of the values
# returned by _read_feature_row.
_FEATURE_COLUMNS = (
    "absolute_sentence_position",
    "sentence_length",
    "number_of_special_terms",
    "sentiment_score",
    "mean_tf_idf",
    "normalized_mean_tf_idf",
    "mean_tf_isf",
    "normalized_mean_tf_isf",
)
_SENTIMENT_COLUMN = _FEATURE_COLUMNS.index("sentiment_score")


def _read_feature_row(feature_files):
    """Read the next line of every feature file and return the stripped values."""
    row = [feature_file.readline().strip() for feature_file in feature_files]
    # A sentiment line may hold several whitespace-separated fields; only the
    # first one is kept. A blank line or end-of-file yields "" instead of
    # raising IndexError or leaking an empty list into the CSV.
    sentiment_fields = row[_SENTIMENT_COLUMN].split()
    row[_SENTIMENT_COLUMN] = sentiment_fields[0] if sentiment_fields else ""
    return row


def create_features_csv(
    features_dir,
    features_csv_filename,
    absolute_sentence_positions_filename,
    sentence_length_filename,
    number_of_special_terms_filename,
    sentiment_score_filename,
    mean_tf_idf_filename,
    normalized_mean_tf_idf_filename,
    mean_tf_isf_filename,
    normalized_mean_tf_isf_filename,
    sentence_vector_filename=None,
    selected_chat_logs=None
):
    """Merge the per-feature text files into one CSV, one row per input line.

    Every feature file is read in lockstep, one line per record. Reading stops
    at the first empty (or missing) line of the absolute-sentence-position
    file, so that file drives the number of rows; the other files are not
    checked for having the same length.

    Args:
        features_dir: Directory containing the input files; the CSV is
            written there as well.
        features_csv_filename: Name of the CSV file to create (overwritten).
        absolute_sentence_positions_filename: Input file, one value per line.
        sentence_length_filename: Input file, one value per line.
        number_of_special_terms_filename: Input file, one value per line.
        sentiment_score_filename: Input file; only the first
            whitespace-separated field of each line is used.
        mean_tf_idf_filename: Input file, one value per line.
        normalized_mean_tf_idf_filename: Input file, one value per line.
        mean_tf_isf_filename: Input file, one value per line.
        normalized_mean_tf_isf_filename: Input file, one value per line.
        sentence_vector_filename: Accepted for interface compatibility but
            currently unused; no sentence-vector column is written.
        selected_chat_logs: Optional sequence of 1-based line numbers. When
            given (and non-empty) only those lines are written. The ids are
            consumed in order while the files are scanned once, so they must
            be in ascending order and exist in the input.

    Raises:
        AssertionError: If `selected_chat_logs` was given and the input ended
            before every listed line number was matched.
        OSError: If any of the files cannot be opened.
    """
    from contextlib import ExitStack

    input_filenames = (
        absolute_sentence_positions_filename,
        sentence_length_filename,
        number_of_special_terms_filename,
        sentiment_score_filename,
        mean_tf_idf_filename,
        normalized_mean_tf_idf_filename,
        mean_tf_isf_filename,
        normalized_mean_tf_isf_filename,
    )

    current_log_id = None
    selected_chat_logs_index = 0
    if selected_chat_logs:
        current_log_id = selected_chat_logs[selected_chat_logs_index]
        print("Current Top: ", current_log_id)

    with ExitStack() as stack:
        # newline="" lets the csv module control line endings itself, as its
        # documentation requires; otherwise some platforms emit blank rows.
        features_csv_file = stack.enter_context(
            open(join(features_dir, features_csv_filename), "w", newline="")
        )
        feature_files = [
            stack.enter_context(open(join(features_dir, filename)))
            for filename in input_filenames
        ]

        features_csv = csv.writer(features_csv_file, delimiter=",")
        features_csv.writerow(_FEATURE_COLUMNS)

        # Line number (1-based) of the record currently held in `row`.
        file_line_number = 1
        row = _read_feature_row(feature_files)
        while row[0]:
            # Compare against None explicitly so that an id of 0 is treated
            # as a (never matching) selection rather than as "no selection".
            is_wanted = (
                current_log_id is None or current_log_id == file_line_number
            )
            if is_wanted:
                features_csv.writerow(row)

            row = _read_feature_row(feature_files)
            file_line_number += 1

            if is_wanted and current_log_id is not None:
                # Advance to the next requested line; stop scanning early
                # once every requested line has been written.
                selected_chat_logs_index += 1
                if selected_chat_logs_index >= len(selected_chat_logs):
                    break
                current_log_id = selected_chat_logs[selected_chat_logs_index]

        if selected_chat_logs:
            assert selected_chat_logs_index == len(selected_chat_logs), (
                "input ended after matching %d of %d selected chat logs; "
                "ids must be ascending 1-based line numbers present in the input"
                % (selected_chat_logs_index, len(selected_chat_logs))
            )
            print("Final index: ", selected_chat_logs_index)
        print("Last File Number: ", file_line_number)

In [6]:
# Directory passed as `features_dir` in the example call below, i.e. where the
# feature input files are read from and the CSV is written to.
FEATURES_DIR = join("..", "feature_outputs")
# Output CSV names. NOTE(review): summarized_chats_features_csv_filename is not
# used in the visible cells; presumably it is the output of a run restricted
# to `selected_chat_logs` - confirm.
features_csv_filename = "features.csv" 
summarized_chats_features_csv_filename = "summarized_chats_features.csv" 
# Per-feature input files, one record per line.
absolute_sentence_positions_filename = "absolute_sentence_positions.txt" 
sentence_length_filename = "sentence_word_counts.txt" 
number_of_special_terms_filename = "special_terms_count.txt" 
sentiment_score_filename = "sentence_sentiments.txt" 
mean_tf_idf_filename = "chats_mean_tf_idf.txt" 
normalized_mean_tf_idf_filename = "normalized_chats_mean_tf_idf.txt" 
mean_tf_isf_filename = "chats_mean_tf_isf.txt" 
normalized_mean_tf_isf_filename = "normalized_chats_mean_tf_isf.txt" 
# Ids of the summarized chat logs; suitable as the `selected_chat_logs`
# argument of create_features_csv.
selected_chat_logs = load_selected_chats(join("..", "data_files", "summarized_chat_log_ids.csv"))
# Sentence vectors are currently disabled, so no filename is defined for them.
# sentence_vector_filename = "sentence_vector.txt" 

# Example invocation, left disabled: builds the CSV from every input line
# (no sentence vectors, no chat-log selection).
# create_features_csv(
#     FEATURES_DIR,
#     features_csv_filename,
#     absolute_sentence_positions_filename,
#     sentence_length_filename,
#     number_of_special_terms_filename,
#     sentiment_score_filename,
#     mean_tf_idf_filename,
#     normalized_mean_tf_idf_filename,
#     mean_tf_isf_filename,
#     normalized_mean_tf_isf_filename,
#     None,
#     None
# )

In [12]:
def count_lines_in_file(input_filename):
    """Return the number of lines in the text file at `input_filename`."""
    with open(input_filename) as handle:
        # Iterate lazily so the file is streamed rather than loaded whole.
        return sum(1 for _ in handle)

In [None]:
# Count the lines of the sentence-embeddings CSV in the feature output directory.
FEATURES_DIR = join("..", "feature_outputs")
sentence_embeddings_filename = join(FEATURES_DIR, "sentence_embeddings_2.csv")
count_lines_in_file(sentence_embeddings_filename)

In [11]:
# Inspect the embeddings file's metadata; st_size is its size in bytes.
stats = os.stat(sentence_embeddings_filename)
stats

os.stat_result(st_mode=33188, st_ino=19478805, st_dev=16777221, st_nlink=1, st_uid=501, st_gid=20, st_size=21151757222, st_atime=1550506379, st_mtime=1550493216, st_ctime=1550494480)