In [4]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import scipy
import os, sys

# Make the shared modules in ../common importable. Append only when missing:
# rebuilding sys.path through set() would scramble the interpreter's module
# search order, which decides which module wins when names collide.
if '../common' not in sys.path:
    sys.path.append('../common')

import utility
import text_corpus
import corpus_vectorizer
import types

# df = pd.read_excel('./data/year+text_window.xlsx')
# df.to_csv('./data/year+text_window.txt', sep='\t')



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
class DataFrameReader:
    """Stream the texts of a data frame as ``(filename, text)`` pairs.

    The frame must provide a ``year`` and a ``txt`` column. Every row that is
    kept is one document; its "filename" is its zero-based position (as a
    string) among the kept rows.

    Attributes set by the constructor:
        df        -- the filtered frame the documents are read from
        metadata  -- one ``SimpleNamespace(filename, year)`` per document
        metadict  -- the same namespaces keyed by filename
        filenames -- the filenames in iteration order
        iterator  -- the generator currently being consumed, or ``None``
    """

    def __init__(self, df, year=None):
        """Select the documents to read.

        Args:
            df: data frame with ``year`` and ``txt`` columns.
            year: when given, only rows whose ``year`` equals it are kept.
        """
        self.df = df

        if year is not None:
            self.df = self.df[self.df.year == year]

        missing_count = int(self.df.txt.isna().sum())
        if missing_count > 0:
            print('Warn: {} n/a rows encountered'.format(missing_count))
            # Drop only the rows that were just counted. A bare dropna() would
            # also discard documents that merely lack a value in another column.
            self.df = self.df.dropna(subset=['txt'])

        self.iterator = None
        self.metadata = [
            types.SimpleNamespace(filename=str(i), year=r)
            for i, r in enumerate(self.df.year.values)
        ]
        self.metadict = {x.filename: x for x in self.metadata}
        self.filenames = [x.filename for x in self.metadata]

    def __iter__(self):
        """Restart the stream; the reader acts as its own iterator."""
        self.iterator = None
        return self

    def __next__(self):
        """Return the next ``(filename, text)`` pair of the current pass."""
        # Created lazily so that every __iter__ call starts a fresh pass.
        if self.iterator is None:
            self.iterator = self.get_iterator()

        return next(self.iterator)

    def get_iterator(self):
        """Return a new generator over ``(filename, text)`` for all documents."""
        return ((str(i), x) for i, x in enumerate(self.df.txt))

In [None]:
import numpy as np

def compute_coocurrence_matrix(reader, **kwargs):
    """Build a table of term co-occurrence counts for the documents in ``reader``.

    The documents are tokenised with ``text_corpus.ProcessedCorpus`` and
    vectorised with ``corpus_vectorizer.CorpusVectorizer``. The product of the
    transposed vectorizer matrix with itself gives a term-by-term matrix, of
    which only the part strictly above the diagonal is kept, so every
    unordered pair of terms is reported once and self-pairs are left out.

    Args:
        reader: document source handed to ``ProcessedCorpus``.
        **kwargs: extra options forwarded to ``ProcessedCorpus``
            (``isalnum=False`` is always passed).

    Returns:
        A DataFrame with the columns ``w1``, ``w2`` and ``value``, ordered by
        the vocabulary ids of the two terms.
    """
    # `import scipy` alone does not guarantee that the `sparse` subpackage has
    # been loaded, so import it explicitly where it is needed.
    import scipy.sparse

    corpus = text_corpus.ProcessedCorpus(reader, isalnum=False, **kwargs)
    vectorizer = corpus_vectorizer.CorpusVectorizer(lowercase=False)
    vectorizer.fit_transform(corpus)

    # Use the matrix's own product: np.dot is not aware of scipy sparse types.
    term_term_matrix = vectorizer.X.T.dot(vectorizer.X)

    # k=1 keeps the entries strictly above the diagonal; the result exposes
    # parallel row / col / data arrays.
    coo = scipy.sparse.triu(term_term_matrix, 1)

    id2token = {i: t for t, i in vectorizer.vocabulary.items()}
    cdf = (
        pd.DataFrame({'w1_id': coo.row, 'w2_id': coo.col, 'value': coo.data})
        .sort_values(['w1_id', 'w2_id'])
        .reset_index(drop=True)
    )
    cdf['w1'] = cdf.w1_id.map(id2token)
    cdf['w2'] = cdf.w2_id.map(id2token)

    return cdf[['w1', 'w2', 'value']]

def compute_co_ocurrence_for_year(source_filename, year, result_filename):
    """Write the term co-occurrence table of one year to an Excel file.

    Args:
        source_filename: tab-separated file with ``year`` and ``txt`` columns.
        year: only documents from this year are processed.
        result_filename: path of the Excel workbook to write.
    """
    corpus_options = {
        'to_lower': True,
        'deacc': False,
        'min_len': 1,
        'max_len': None,
        'numerals': False,
    }

    documents = pd.read_csv(source_filename, sep='\t')[['year', 'txt']]
    year_reader = DataFrameReader(documents, year)

    compute_coocurrence_matrix(year_reader, **corpus_options).to_excel(result_filename)

# Build the co-occurrence table for the documents of 1957 and save it as a workbook.
compute_co_ocurrence_for_year(
    source_filename='./data/year+text_window.txt',
    year=1957,
    result_filename='test_1957.xlsx',
)



Warn: 40 n/a rows encountered


In [33]:
import unittest
import corpus_vectorizer

class Test_DfReader(unittest.TestCase):
    """Checks the documents, filenames and metadata exposed by DataFrameReader."""

    def setUp(self):
        pass

    def create_test_dataframe(self):
        """Return a six-document frame covering the years 2000, 2001 and 2003."""
        years = [2000, 2000, 2001, 2003, 2003, 2003]
        texts = ['A B C', 'B C D', 'C B', 'A B F', 'E B', 'F E E']
        return pd.DataFrame(list(zip(years, texts)), columns=['year', 'txt'])

    def test_reader_with_all_documents(self):
        """Without a year filter every row is returned, numbered from zero."""
        reader = DataFrameReader(self.create_test_dataframe())
        documents = list(reader)

        self.assertEqual(
            [('0', 'A B C'), ('1', 'B C D'), ('2', 'C B'), ('3', 'A B F'), ('4', 'E B'), ('5', 'F E E')],
            documents
        )
        self.assertEqual([str(i) for i in range(6)], reader.filenames)

        expected_metadata = [
            types.SimpleNamespace(filename=str(position), year=doc_year)
            for position, doc_year in enumerate((2000, 2000, 2001, 2003, 2003, 2003))
        ]
        self.assertEqual(expected_metadata, reader.metadata)

    def test_reader_with_given_year(self):
        """With a year filter only that year's rows remain and are renumbered."""
        reader = DataFrameReader(self.create_test_dataframe(), 2003)
        documents = list(reader)

        self.assertEqual([('0', 'A B F'), ('1', 'E B'), ('2', 'F E E')], documents)
        self.assertEqual([str(i) for i in range(3)], reader.filenames)

        expected_metadata = [
            types.SimpleNamespace(filename=str(position), year=2003)
            for position in range(3)
        ]
        self.assertEqual(expected_metadata, reader.metadata)

class Test_DfVectorize(unittest.TestCase):
    """Checks the corpus streams, the vectorizer output and the co-occurrence table."""

    def setUp(self):
        pass

    def create_test_dataframe(self):
        """Return a six-document frame covering the years 2000, 2001 and 2003."""
        data = [
            (2000, 'A B C'),
            (2000, 'B C D'),
            (2001, 'C B'),
            (2003, 'A B F'),
            (2003, 'E B'),
            (2003, 'F E E')
        ]
        df = pd.DataFrame(data, columns=['year', 'txt'])
        return df

    def create_corpus(self):
        """Return a ProcessedCorpus over the test frame with all filters switched off."""
        df = self.create_test_dataframe()
        reader = DataFrameReader(df)
        kwargs = dict(isalnum=False, to_lower=False, deacc=False, min_len=0, max_len=None, numerals=False)
        corpus = text_corpus.ProcessedCorpus(reader, **kwargs)
        return corpus

    def test_corpus_text_stream(self):
        """CorpusTextStream yields the raw text of every document."""
        df = self.create_test_dataframe()
        reader = DataFrameReader(df)
        corpus = text_corpus.CorpusTextStream(reader)
        result = [x for x in corpus.documents()]
        expected = [('0', 'A B C'), ('1', 'B C D'), ('2', 'C B'), ('3', 'A B F'), ('4', 'E B'), ('5', 'F E E')]
        self.assertEqual(expected, result)

    def test_corpus_token_stream(self):
        """CorpusTokenStream yields every document as a list of tokens."""
        df = self.create_test_dataframe()
        reader = DataFrameReader(df)
        corpus = text_corpus.CorpusTokenStream(reader)
        result = [x for x in corpus.documents()]
        expected = [('0', ['A', 'B', 'C']), ('1', ['B', 'C', 'D']), ('2', ['C', 'B']), ('3', ['A', 'B', 'F']), ('4', ['E', 'B']), ('5', ['F', 'E', 'E'])]
        self.assertEqual(expected, result)

    def test_processed_corpus_token_stream(self):
        """ProcessedCorpus with these options leaves the tokens unchanged."""
        df = self.create_test_dataframe()
        reader = DataFrameReader(df)
        kwargs = dict(isalnum=False, to_lower=False, deacc=False, min_len=0, max_len=None, numerals=False)
        corpus = text_corpus.ProcessedCorpus(reader, **kwargs)
        result = [x for x in corpus.documents()]
        expected = [('0', ['A', 'B', 'C']), ('1', ['B', 'C', 'D']), ('2', ['C', 'B']), ('3', ['A', 'B', 'F']), ('4', ['E', 'B']), ('5', ['F', 'E', 'E'])]
        self.assertEqual(expected, result)

    def test_fit_transform_gives_document_term_matrix(self):
        """fit_transform stores per-document token counts and the vocabulary."""
        reader = DataFrameReader(self.create_test_dataframe())
        kwargs = dict(to_lower=False, deacc=False, min_len=1, max_len=None, numerals=False)
        corpus = text_corpus.ProcessedCorpus(reader, isalnum=False, **kwargs)
        vectorizer = corpus_vectorizer.CorpusVectorizer(lowercase=False)
        vectorizer.fit_transform(corpus)
        expected = np.asarray([
            [1, 1, 1, 0, 0, 0],
            [0, 1, 1, 1, 0, 0],
            [0, 1, 1, 0, 0, 0],
            [1, 1, 0, 0, 0, 1],
            [0, 1, 0, 0, 1, 0],
            [0, 0, 0, 0, 2, 1]
        ])
        self.assertTrue((expected == vectorizer.X).all())
        results = vectorizer.vocabulary
        expected = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'F': 5, 'E': 4 }
        self.assertEqual(expected, results)

    def test_AxAt_of_document_term_matrix_gives_term_term_matrix(self):
        """The transposed matrix times itself gives the term co-occurrence counts."""
        # `import scipy` alone does not guarantee the `sparse` subpackage is loaded.
        import scipy.sparse

        reader = DataFrameReader(self.create_test_dataframe())
        kwargs = dict(to_lower=False, deacc=False, min_len=1, max_len=None, numerals=False)
        corpus = text_corpus.ProcessedCorpus(reader, isalnum=False, **kwargs)
        vectorizer = corpus_vectorizer.CorpusVectorizer(lowercase=False)
        vectorizer.fit_transform(corpus)

        # Use the matrix's own product rather than np.dot on the vectorizer output.
        term_term_matrix = vectorizer.X.T.dot(vectorizer.X)

        expected = np.asarray([
            [2, 2, 1, 0, 0, 1],
            [2, 5, 3, 1, 1, 1],
            [1, 3, 3, 1, 0, 0],
            [0, 1, 1, 1, 0, 0],
            [0, 1, 0, 0, 5, 2],
            [1, 1, 0, 0, 2, 2]
        ])
        self.assertTrue((expected == term_term_matrix).all())

        # Keep only the entries strictly above the diagonal: one per unordered pair.
        coo = scipy.sparse.triu(term_term_matrix, 1)

        id2token = { i: t for t,i in vectorizer.vocabulary.items()}
        cdf = pd.DataFrame({
            'w1_id': coo.row,
            'w2_id': coo.col,
            'value': coo.data
        })[['w1_id', 'w2_id', 'value']].sort_values(['w1_id', 'w2_id'])\
            .reset_index(drop=True)
        cdf['w1'] = cdf.w1_id.map(id2token)
        cdf['w2'] = cdf.w2_id.map(id2token)

        # Pin the resulting table instead of only printing it.
        expected_pairs = [
            ('A', 'B', 2), ('A', 'C', 1), ('A', 'F', 1),
            ('B', 'C', 3), ('B', 'D', 1), ('B', 'E', 1), ('B', 'F', 1),
            ('C', 'D', 1), ('E', 'F', 2)
        ]
        actual_pairs = list(cdf[['w1', 'w2', 'value']].itertuples(index=False, name=None))
        self.assertEqual(expected_pairs, actual_pairs)

        print(cdf[['w1', 'w2', 'value']])
        
# Run the TestCase classes defined in this notebook inside the kernel. The dummy
# argv keeps unittest from parsing the kernel's own command line, and exit=False
# stops it from calling sys.exit() once the run has finished.
unittest.main(argv=['first-arg-is-ignored'], exit=False)
        

.......

  w1 w2  value
0  A  B      2
1  A  C      1
2  A  F      1
3  B  C      3
4  B  D      1
5  B  E      1
6  B  F      1
7  C  D      1
8  E  F      2



----------------------------------------------------------------------
Ran 7 tests in 0.063s

OK


<unittest.main.TestProgram at 0x7f2b4d58db00>