# create training data for community embedding training

In [11]:
import os
import csv
import pandas as pd
import sqlite3


> I used my own code to create the training data and the `wv` and `cv` files. The first file (which they call `dep.contexts`) is just space-delimited subreddit–user pairs, one pair per line, as you've already generated. The `wv` (word vocab) file holds the count of each 'word': for example, if the subreddit `AskReddit` appears 100000 times in `dep.contexts`, it should contain the line "AskReddit 100000". The `cv` file is the same but for contexts (users): a user `isaacwaller` who appears 100 times gets the line "isaacwaller 100". In pseudocode:

## load data

In [12]:
import warnings

# Project root = current working directory with its last 19 characters removed.
# NOTE(review): 19 is presumably the length of this notebook's folder name
# relative to the root — confirm; the slice silently yields a wrong path if
# the notebook is started from any other directory.
ROOT_PATH = os.getcwd()[:-19]

# Preprocessed Reddit database; ROOT_PATH is joined without a separator, so it
# is expected to end with a path separator already.
DBPATH = f"{ROOT_PATH}topic-model-reddit/data/reddit-db-preprocessed.db"

# Module-level connection and cursor shared by the cells below.
conn = sqlite3.connect(DBPATH)
cur = conn.cursor()

# Subreddits left out of every query issued by this notebook.
skipped_subr = ["politics", "the_donald", "chapotraphouse", "neoliberal"]

# NOTE(review): this silences every warning category for the whole kernel.
warnings.filterwarnings("ignore")

In [15]:
def _fetch_author_rows(table, id_column, connection=None, skipped=None):
    """Select ``id_column, author, subreddit`` from ``table``.

    Rows whose author is ``'[deleted]'`` are dropped, as are rows whose
    subreddit is listed in ``skipped``.

    Parameters
    ----------
    table : str
        Table name, e.g. ``"submission-2013"``.
    id_column : str
        Name of the id column to select (``"id"`` or ``"link_id"``).
    connection : sqlite3.Connection, optional
        Database connection; falls back to the module-level ``conn``.
    skipped : iterable of str, optional
        Subreddit names to exclude; falls back to the module-level
        ``skipped_subr``. May be empty, in which case no subreddit is excluded.

    Returns
    -------
    pandas.DataFrame
        One row per matching record with columns
        ``[id_column, "author", "subreddit"]``.
    """
    if connection is None:
        connection = conn
    if skipped is None:
        skipped = skipped_subr
    skipped = list(skipped)

    # Identifiers (table / column names) cannot be bound as SQL parameters, so
    # they are interpolated; both come from this module, not from user input.
    query = (
        f'SELECT {id_column}, author, subreddit FROM "{table}" '
        "WHERE author != '[deleted]'"
    )
    if skipped:
        # Values are bound via placeholders, so names containing quotes are
        # safe and the skip list may have any length.
        placeholders = ", ".join("?" for _ in skipped)
        query += f" AND subreddit NOT IN ({placeholders})"

    return pd.read_sql_query(query, connection, params=skipped or None)


def build_subm_df(year, connection=None, skipped=None):
    """Fetch the submissions of ``year`` as a DataFrame.

    Reads table ``submission-<year>`` and returns columns ``id``, ``author``
    and ``subreddit``, excluding deleted authors and skipped subreddits.
    ``connection`` and ``skipped`` default to the module-level ``conn`` and
    ``skipped_subr``.
    """
    print("  fetching submissions...")
    return _fetch_author_rows(f"submission-{year}", "id", connection, skipped)


def build_comm_df(year, connection=None, skipped=None):
    """Fetch the comments of ``year`` as a DataFrame.

    Reads table ``comment-<year>`` and returns columns ``link_id``, ``author``
    and ``subreddit``, excluding deleted authors and skipped subreddits.
    The first three characters of every ``link_id`` are removed.
    ``connection`` and ``skipped`` default to the module-level ``conn`` and
    ``skipped_subr``.
    """
    print("  fetching comments...")
    comments = _fetch_author_rows(f"comment-{year}", "link_id", connection, skipped)
    # Drop the 3-character prefix (presumably Reddit's "t3_" marker — confirm
    # against the data). The vectorised .str accessor leaves missing ids as
    # NaN instead of raising on None like a plain x[3:] would.
    comments["link_id"] = comments["link_id"].str[3:]
    return comments

def build_df_year(year):
    """Return one DataFrame holding the submissions and comments of ``year``.

    The submission frame is stacked on top of the comment frame with
    ``pd.concat``. The two inputs name their id column differently (``id``
    vs. ``link_id``), so the result carries both columns, and the row index
    of each part is kept as-is rather than renumbered.
    """
    print(f"building base dataframe for year {year}")
    # Submissions are fetched first, then comments, matching the log order.
    frames = [build_subm_df(year), build_comm_df(year)]
    return pd.concat(frames)

In [27]:
def create_training_data(df_year, out_dir="."):
    """Write the embedding training files for one frame of posts.

    Three space-delimited, header-less text files are produced in ``out_dir``:

    * ``dep.contexts`` - one ``author subreddit`` line per row of ``df_year``.
    * ``cv.txt``       - context (user) vocabulary: ``author count`` lines.
    * ``wv.txt``       - word (subreddit) vocabulary: ``subreddit count`` lines.

    Parameters
    ----------
    df_year : pandas.DataFrame
        Frame with at least the columns ``author`` and ``subreddit``.
    out_dir : str, optional
        Target directory (created if missing). Defaults to the current
        working directory, which is where the files were always written.

    Returns
    -------
    None
    """
    os.makedirs(out_dir, exist_ok=True)

    # Work on the frame that was passed in, not on a notebook-level global.
    pairs = df_year[["author", "subreddit"]]

    # dep.contexts
    # NOTE(review): pairs are written as "author subreddit"; the notebook's
    # description speaks of "subreddit user pairs" — confirm which order the
    # training tool expects before changing it.
    print("producing dep.contexts...")
    pairs.to_csv(os.path.join(out_dir, "dep.contexts"), sep=" ", index=False, header=False)

    # context --> user
    print("producing cv.txt...")
    pairs.groupby("author").count().to_csv(os.path.join(out_dir, "cv.txt"), sep=" ", header=False)

    # word --> subreddit
    print("producing wv.txt...")
    pairs.groupby("subreddit").count().to_csv(os.path.join(out_dir, "wv.txt"), sep=" ", header=False)

In [16]:
# Build the combined submissions + comments frame for 2013 (queries the SQLite database).
df_2013 = build_df_year(2013)

building base dataframe for year 2013
  fetching submissions...
  fetching comments...


In [28]:
# Write dep.contexts, cv.txt and wv.txt for the 2013 frame.
create_training_data(df_2013)

producing dep.contexts...
producing cv.txt...
producing wv.txt...
