## Softcosine Similarity

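**Purpose**: compute soft cosine similarity between the documents of the outlets listed below (every outlet is used as both source and target), using INCA's `softcosine_similarity` analysis.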
- Code based on https://github.com/damian0604/newsevents/blob/master/src/data-processing/020-softcosine_newsevents.py

In [1]:
# Quiet matplotlib's logger: its messages are still logged even though
# disable_existing_loggers=yes is set in logging_config.yaml
# (see https://stackoverflow.com/a/51529172/7016397).
# Workaround: set its level manually *before* the project logger is created,
# so the order of the statements in this cell matters.
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

# Logger for this notebook (logger_type='main'), obtained from the project's logging helper.
from usrightmedia.shared.loggers import get_logger
LOGGER = get_logger(filename = '01-softcosine-similarity', logger_type='main')

In [2]:
from datetime import datetime
import gensim
# Bare expression: shows the installed gensim version as this cell's output.
gensim.__version__

'3.8.1'

In [3]:
import os
from inca import Inca
# INCA instance; the similarity run below is invoked through
# myinca.analysis.softcosine_similarity.
myinca = Inca()

In [4]:
# Outlet doctypes, one per line; used as both `source` and `target`
# of the softcosine similarity run. str.split() on whitespace yields a
# plain list of strings in the order written here.
outlet_doctypes = """
    americanrenaissance
    breitbart
    dailycaller
    dailystormer
    foxnews
    gatewaypundit
    infowars
    newsmax
    oneamericanews
    rushlimbaugh
    seanhannity
    vdare
    washingtonexaminer
""".split()

In [5]:
# Run INCA's softcosine similarity analysis over all outlets and log how long it took.
#
# Paths are relative to the notebook's working directory. They are named here
# so the (long) call below stays readable and the locations are easy to change.
WORD2VEC_MODEL_PATH = os.path.join(
    "..", "..", "data", "gensim-data", "word2vec-google-news-300", "word2vec-google-news-300.gz"
)
SOFTCOSINE_OUTPUT_DIR = os.path.join(
    "..", "..", "data", "02-intermediate", "06-newsevents", "01-softcosine-output"
)

START_TIME = datetime.now()
LOGGER.info(f"Starting softcosine similarity calculation at {START_TIME}")

myinca.analysis.softcosine_similarity.fit(
    path_to_model=WORD2VEC_MODEL_PATH,
    # Every outlet is compared against every outlet (same list on both sides).
    source=outlet_doctypes,
    target=outlet_doctypes,
    # Same text and date fields are used for source and target documents.
    sourcetext='article_maintext_4',
    sourcedate='publish_date',
    targettext='article_maintext_4',
    targetdate='publish_date',
    # No keyword filtering on either side.
    keyword_source=None,
    keyword_target=None,
    keyword_source_must=False,
    keyword_target_must=False,
    # Only documents flagged should_include=True are considered.
    condition_source={'should_include': True},
    condition_target={'should_include': True},
    # NOTE(review): presumably the window of target documents around each source
    # document's date (0 days before, 2 days after) - confirm against INCA docs.
    days_before=0,
    days_after=2,
    merge_weekend=False,  # do not assume weekend can be collapsed
    threshold=0.2,
    from_time='2016-01-01',  # gte
    to_time='2021-01-01',  # lte
    to_csv=False,  # return a pickled pandas dataframe instead of a CSV file
    destination=SOFTCOSINE_OUTPUT_DIR,
    to_pajek=False,  # not available in combination with days_before/days_after parameters
    filter_above=0.5,  # default
    filter_below=5,  # default
)

END_TIME = datetime.now()
LOGGER.info(f"Finished softcosine similarity calculation at {END_TIME}")
# Report the duration explicitly so it does not have to be derived from the two timestamps.
LOGGER.info(f"Softcosine similarity calculation took {END_TIME - START_TIME}")

100%|██████████| 173504/173504 [01:01<00:00, 2814.38it/s]
100%|██████████| 173504/173504 [01:18<00:00, 2207.15it/s]
1539it [34:07,  1.33s/it]
