In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Initialize the Hugging Face transformers pipeline used for sentiment analysis (SA).
from transformers import pipeline

# Pin the checkpoint explicitly instead of relying on the library's implicit default
# for the 'sentiment-analysis' task, so results stay reproducible if that default
# ever changes. This is the checkpoint the un-pinned call resolved to when the
# notebook was last run (it is named in the download log printed by this cell).
SENTIMENT_MODEL = 'distilbert-base-uncased-finetuned-sst-2-english'
sentiment = pipeline('sentiment-analysis', model=SENTIMENT_MODEL)

Downloading: 100%|███████████████████████████████████████████████████████████████████████████| 230/230 [00:00<00:00, 230kB/s]
Couldn't reach server at 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-modelcard.json' to download model card file.
Creating an empty model card.


In [3]:
# Seed numpy's global random number generator so repeated runs draw the same numbers.
RANDOM_SEED = 1337
np.random.seed(RANDOM_SEED)

In [4]:
def remove_multiple_masks(df):
    """Keep only rows whose 'too_many_masks' value equals False, then drop that column.

    The input frame is left untouched; a new frame without the flag column is returned.
    """
    flag_column = 'too_many_masks'
    keep_rows = df[flag_column].eq(False)
    return df.loc[keep_rows].drop(columns=flag_column)


In [5]:
def clean(df):
    """Run every cleaning step on ``df`` in order and return the cleaned frame."""
    cleaning_steps = (
        remove_multiple_masks,
        # add more funcs as we go
    )
    for step in cleaning_steps:
        df = step(df)
    return df

In [6]:
def translate_sentiment(label):
    """Map a sentiment label to a polarity: -1 if it contains 'NEG', otherwise 1."""
    return -1 if 'NEG' in label else 1

def calc_sentiment(row):
    """Score ``row['text']`` with the sentiment pipeline and store the result on the row.

    The first prediction's score is written to 'sentiment_score' and its label,
    converted by ``translate_sentiment``, to 'sentiment'. The row is modified in
    place and also returned.
    """
    predictions = sentiment(row['text'])
    top_prediction = predictions[0]
    row['sentiment_score'] = top_prediction['score']
    row['sentiment'] = translate_sentiment(top_prediction['label'])
    return row

In [7]:
def make_path(root, folder):
    """Return the path ``root/folder``, creating the directory if it is missing.

    Args:
        root: Parent directory.
        folder: Name of the directory to create inside ``root``.

    Returns:
        The joined path as a string.

    Raises:
        FileExistsError: If the path already exists but is not a directory.
    """
    path = os.path.join(root, folder)
    # makedirs(exist_ok=True) replaces the exists()-then-mkdir() pair: there is no
    # window in which another process can create the folder between the check and
    # the creation, and missing parent directories are created instead of failing.
    os.makedirs(path, exist_ok=True)
    return path

In [8]:
# The raw per-topic exports are read from <parent of the working directory>/graphql/downloads.
DOWNLOAD_SUBFOLDERS = ("graphql", "downloads")
main_folder = os.path.split(os.getcwd())[0]
data_path = os.path.join(main_folder, *DOWNLOAD_SUBFOLDERS)
data_path

'C:\\Users\\Tollef\\Documents\\GitHub\\masterNEW\\REPO\\graphql\\downloads'

In [9]:
# Folder inside the current working directory that receives the processed files
# (created if it does not exist yet).
output_folder_name = "graphql-topic-downloads"
downloads = make_path(os.getcwd(), output_folder_name)
downloads

'C:\\Users\\Tollef\\Documents\\GitHub\\masterNEW\\REPO\\dataset\\graphql-topic-downloads'

In [12]:
# Walk every topic folder below data_path, clean and sentiment-score its entities,
# write one "<topic>-entities.csv" per topic, then write the combined entity and
# text files into the downloads folder.
all_topics_entity_dfs = []
all_topics_texts_dfs = []

for subdir, dirs, files in os.walk(data_path):
    # os.walk yields sub-folders in arbitrary order; sorting the list in place makes
    # the traversal (and so the row order of the combined files) deterministic.
    dirs.sort()
    if dirs:
        # only leaf folders (folders without sub-folders) are treated as topics
        continue
    folder_name = os.path.basename(subdir)
    if not folder_name:
        continue

    entity_path = os.path.join(subdir, "entities.csv")
    text_path = os.path.join(subdir, "texts.csv")
    if not (os.path.isfile(entity_path) and os.path.isfile(text_path)):
        # A leaf folder that is not a topic export (e.g. a stray .ipynb_checkpoints
        # folder) must not abort the whole run with a FileNotFoundError.
        print('Skipping {} (entities.csv or texts.csv missing)'.format(subdir))
        continue

    entity_df = pd.read_csv(entity_path)
    text_df = pd.read_csv(text_path)

    # clean the entity file
    entity_df = clean(entity_df)

    # remove columns not needed for annotation (currently disabled)
    # entity_df = entity_df.drop(columns=['entity_id', 'extractors', 'score', 'score_within_variance'])
    # add columns to ready for prior sentiment analysis
    entity_df['sentiment'] = np.nan
    entity_df['sentiment_score'] = np.nan
    # apply sentiment analysis from huggingface transformers distilbert, one row at a time
    entity_df = entity_df.apply(calc_sentiment, axis=1)

    # store at a new location, within the dataset folder:
    dataset_loc = os.path.join(downloads, folder_name)
    print(dataset_loc)

    # NOTE: to_csv writes the frame's index as the first, unnamed column.
    entity_df.to_csv(dataset_loc + "-entities.csv")

    all_topics_entity_dfs.append(entity_df)
    all_topics_texts_dfs.append(text_df)

if not all_topics_entity_dfs:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate"
    raise ValueError(
        'No topic folders with entities.csv and texts.csv found under {}'.format(data_path)
    )

# Each frame keeps its own index labels, so labels can repeat across topics here.
all_entities = pd.concat(all_topics_entity_dfs)
all_texts = pd.concat(all_topics_texts_dfs)

print('Storing files in {}'.format(downloads))
all_entities.to_csv(os.path.join(downloads, "all_entities.csv"))
all_texts.to_csv(os.path.join(downloads, "TEXTS_LOOKUP_FILE.csv"))

C:\Users\Tollef\Documents\GitHub\masterNEW\REPO\dataset\graphql-topic-downloads\business
C:\Users\Tollef\Documents\GitHub\masterNEW\REPO\dataset\graphql-topic-downloads\politics
C:\Users\Tollef\Documents\GitHub\masterNEW\REPO\dataset\graphql-topic-downloads\sports
C:\Users\Tollef\Documents\GitHub\masterNEW\REPO\dataset\graphql-topic-downloads\tech
Storing files in C:\Users\Tollef\Documents\GitHub\masterNEW\REPO\dataset\graphql-topic-downloads
