In [1]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))

import numpy as np
import pandas as pd

from models.layers import ProcessText
from utils import DATA_COLS, RAW_DATA_DIR, DATA_DIR

In [2]:
# Output CSVs built from the Amazon, IMDB and Yelp datasets
MINI_OUT = DATA_DIR / "mini-data.csv"
MINI_STOPLESS_OUT = DATA_DIR / "mini-stopless-data.csv"

# Output CSVs built from the Twitter dataset
MAIN_OUT = DATA_DIR / "main-data.csv"
MAIN_STOPLESS_OUT = DATA_DIR / "main-stopless-data.csv"

# Switches read by the build cells further down: set one to False to skip
# writing the corresponding output file.
MAKE_MINI = True
MAKE_MINI_STOPLESS = True
MAKE_MAIN = True
MAKE_MAIN_STOPLESS = True

In [3]:
# Fix NumPy's legacy global random state so any code drawing from np.random is
# repeatable between runs. NOTE(review): no direct np.random use is visible in
# this notebook; presumably this is for imported project code - confirm.
np.random.seed(seed=0)

In [4]:
def save_df(df, path):
    """Write ``df`` to ``path`` as CSV, omitting both the header row and the index."""
    csv_options = {"header": False, "index": False}
    df.to_csv(path, **csv_options)
    
def apply_df_text(df, func, *args, **kwargs):
    """Replace the "text" column of ``df`` with ``func`` applied to each value.

    Extra positional arguments are handed to ``Series.apply`` through its
    ``args`` parameter and keyword arguments are passed along to it as-is.
    The frame is modified in place; nothing is returned.
    """
    transformed = df["text"].apply(func, args=args, **kwargs)
    df["text"] = transformed

In [5]:
def make_mini_df(names=("amazon", "imdb", "yelp"), raw_dir=None):
    """Load the small labelled corpora and merge them into one DataFrame.

    For every entry in ``names`` the file ``<name>_labelled.txt`` is read as
    tab-separated text without a header row; its two columns are named
    "text" and "sentiment".

    Args:
        names: Iterable of file-name prefixes to load. Defaults to the three
            corpora this notebook was written for.
        raw_dir: Directory containing the files. ``None`` (the default) uses
            ``RAW_DATA_DIR``.

    Returns:
        A DataFrame with the columns "text" and "sentiment" in which rows
        repeating an earlier "text" value have been dropped. The per-file row
        labels are kept (the index is not reset), so index values may repeat.
    """
    base_dir = RAW_DATA_DIR if raw_dir is None else Path(raw_dir)
    columns = ["text", "sentiment"]

    # Read everything first and concatenate once: growing a frame with
    # pd.concat inside a loop copies all previous rows on every iteration, and
    # seeding it with an empty DataFrame relies on deprecated concat behaviour.
    frames = [
        pd.read_csv(base_dir / f"{name}_labelled.txt", sep="\t", names=columns)
        for name in names
    ]
    if not frames:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=columns)

    return pd.concat(frames).drop_duplicates(subset="text")

def make_twitter_df():
    """Load the raw Twitter CSV and reduce it to the shared column layout.

    Reads ``twitter-tweets.csv`` from ``RAW_DATA_DIR`` (ISO-8859-1 encoded, no
    header row), keeps only the "sentiment" and "text" fields, arranges the
    columns as listed in ``DATA_COLS``, rewrites sentiment value 4 as 1 and
    drops rows whose "text" repeats an earlier row.

    Returns:
        The resulting DataFrame. The original row labels are kept (the index
        is not reset).
    """
    encoding = "ISO-8859-1"
    cols = ["sentiment", "id", "date", "flag", "user", "text"]

    # Only two of the six fields are needed, so let the parser skip the rest
    # instead of loading them and dropping them afterwards.
    df = pd.read_csv(
        RAW_DATA_DIR / "twitter-tweets.csv",
        encoding=encoding,
        names=cols,
        usecols=["sentiment", "text"],
    )
    # Re-order the columns to match the layout used for the other datasets.
    df = df.reindex(columns=DATA_COLS)
    # Positive rows are stored as 4 in the raw file; map them to 1.
    # NOTE(review): presumably the source uses a 0 = negative / 4 = positive
    # labelling scheme - confirm against the dataset description.
    df["sentiment"] = df["sentiment"].replace(4, 1)

    return df.drop_duplicates(subset="text")

In [6]:
def process_df(df, save_path=None, **kwargs):
    """Run ``ProcessText`` over the "text" column and optionally save the frame.

    The "text" column of ``df`` is overwritten in place with the processed
    strings, so the caller's frame is modified.

    Args:
        df: DataFrame containing a "text" column.
        save_path: When not ``None``, the processed frame is written to this
            path via ``save_df`` (CSV without header row or index).
        **kwargs: Forwarded unchanged to the ``ProcessText`` constructor.

    Returns:
        The same ``df`` object that was passed in.
    """
    # ProcessText is a project layer; its result exposes .numpy() and yields
    # elements that need .decode() - presumably a TensorFlow string tensor of
    # bytes. TODO confirm.
    df["text"] = ProcessText(**kwargs)(df["text"].values).numpy()
    df["text"] = df["text"].apply(lambda raw: raw.decode())

    if save_path is not None:
        # Use the notebook's shared CSV writer rather than repeating its options.
        save_df(df, save_path)

    return df

In [7]:
if MAKE_MINI or MAKE_MINI_STOPLESS:
    df = make_mini_df()

    if MAKE_MINI:
        df = process_df(df, MINI_OUT, stopwords=False)
        print(f"\n{MINI_OUT} saved")
        # DataFrame.info() prints its own report and returns None, so wrapping
        # it in print() only added a stray "None" line to the output.
        df.info()

    if MAKE_MINI_STOPLESS:
        # When the block above ran, `df` already holds processed text and a
        # different option set is used for this second pass.
        # NOTE(review): the meaning of ProcessText's `stopwords` / `ignore`
        # options is not visible here - confirm this combination really
        # produces the stopword-free variant.
        processor_opts = {"stopwords": False, "ignore": True} if MAKE_MINI else {}
        df = process_df(df, MINI_STOPLESS_OUT, **processor_opts)
        print(f"\n{MINI_STOPLESS_OUT} saved")
        df.info()


/Users/Taennan/Projects/Web/Apps/sass-bot/back/src/ml/data/processed/mini-data.csv saved
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2731 entries, 0 to 999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       2731 non-null   object
 1   sentiment  2731 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 64.0+ KB
None

/Users/Taennan/Projects/Web/Apps/sass-bot/back/src/ml/data/processed/mini-stopless-data.csv saved
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2731 entries, 0 to 999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       2731 non-null   object
 1   sentiment  2731 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 64.0+ KB
None


In [8]:
if MAKE_MAIN or MAKE_MAIN_STOPLESS:
    df = make_twitter_df()

    if MAKE_MAIN:
        df = process_df(df, MAIN_OUT, stopwords=False)
        print(f"\n{MAIN_OUT} saved")
        # DataFrame.info() prints its own report and returns None, so wrapping
        # it in print() only added a stray "None" line to the output.
        df.info()

    if MAKE_MAIN_STOPLESS:
        # When the block above ran, `df` already holds processed text and a
        # different option set is used for this second pass.
        # NOTE(review): the meaning of ProcessText's `stopwords` / `ignore`
        # options is not visible here - confirm this combination really
        # produces the stopword-free variant.
        processor_opts = {"stopwords": False, "ignore": True} if MAKE_MAIN else {}
        df = process_df(df, MAIN_STOPLESS_OUT, **processor_opts)
        print(f"\n{MAIN_STOPLESS_OUT} saved")
        df.info()


/Users/Taennan/Projects/Web/Apps/sass-bot/back/src/ml/data/processed/main-data.csv saved
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1581466 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   text       1581466 non-null  object
 1   sentiment  1581466 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 36.2+ MB
None

/Users/Taennan/Projects/Web/Apps/sass-bot/back/src/ml/data/processed/main-stopless-data.csv saved
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1581466 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   text       1581466 non-null  object
 1   sentiment  1581466 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 36.2+ MB
None
