# Imports and data

In [1]:
import pandas as pd
import time
import re 
import progressbar as pb

import jieba
from collections import Counter
import string
from jieba import posseg as pseg
import matplotlib

import numpy as np

# Load the dataset and split its titles by public-release status.
df = pd.read_excel("Dataset for Table_1_2, Fig_4.xlsx")

# Rows whose flag is missing (NaN) equal neither True nor False, so they
# end up in neither subset.
released_flag = df["fulltext_released_to_public"]
all_visible = df[released_flag.eq(True)]
all_invisible = df[released_flag.eq(False)]

# Plain Python lists of titles, one per subset.
visible_list = all_visible["title"].values.tolist()
invisible_list = all_invisible["title"].values.tolist()

# Text cleaning

In [2]:
def strip_text(text):
    """
    Reduce one Chinese text to its content words, joined by single spaces.

    The text is segmented and part-of-speech tagged with ``pseg.cut``. A token
    is kept only when all of the following hold:

    * its tag does not start with ``ns``, ``x`` or ``m``
      (described by the original author as place names, non-morphemes
      and measures);
    * the word is at least two characters long;
    * its tag starts with ``n``, ``v`` or ``j``.

    The word and the tag are read from each token's ``word`` and ``flag``
    attributes instead of being parsed back out of its ``"word/flag"`` string
    form. Parsing the string truncated any token containing ``_`` at its first
    underscore and let it match a tag by accident (``file_name`` tagged
    ``eng`` used to be emitted as ``file``).

    args:
        text: a single string to clean (one title, not a list of strings)

    returns:
        str: the kept words in their original order, separated by spaces;
        the empty string when no token qualifies

    """
    excluded_tags = ("ns", "x", "m")
    # NOTE(review): the earlier comment called these "nouns, verbs,
    # adjectives". In the ICTCLAS-style tagset jieba appears to follow,
    # adjectives are tagged `a` and `j` marks abbreviations -- confirm which
    # was intended. The filter itself is unchanged.
    content_tags = ("n", "v", "j")

    kept_words = []
    for token in pseg.cut(text):
        word, tag = token.word, token.flag
        if tag.startswith(excluded_tags):
            continue
        if len(word) < 2:
            continue
        if tag.startswith(content_tags):
            kept_words.append(word)
    return ' '.join(kept_words)

# Clean every title: `v` holds the cleaned titles from visible_list,
# `inv` those from invisible_list.
v = list(map(strip_text, visible_list))
inv = list(map(strip_text, invisible_list))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\vbrus\AppData\Local\Temp\jieba.cache
Loading model cost 0.624 seconds.
Prefix dict has been built successfully.


# Fightin' Words algorithm

In [3]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer as CV
import string

def bayes_compare_language(l1, l2, ngram = 1, prior=.01, cv = None):
    '''
    Score every vocabulary item by how strongly it distinguishes two corpora.

    For each term the log-odds of the term in l1 minus its log-odds in l2 is
    computed on prior-smoothed counts, then divided by the square root of its
    estimated variance to give a z-score.

    args:
      l1, l2; a list of strings from each language sample
      ngram; an int describing up to what n gram you want to consider (1 is unigrams,
    2 is bigrams + unigrams, etc). Ignored if a custom CountVectorizer is passed.
      prior; either a scalar (int or float) describing a uniform prior, or a vector
    describing a prior over vocabulary items, one entry per vocabulary item. If you're
    using a predefined vocabulary, make sure to specify that when you make your
    CountVectorizer object.
      cv; a sklearn.feature_extraction.text.CountVectorizer object, if desired.

    returns:
      A list of length |Vocab| where each entry is a (n-gram, zscore) tuple, sorted
    by ascending z-score. Positive scores mark terms over-represented in l1,
    negative scores terms over-represented in l2.

    raises:
      ValueError; if a vector prior is given without a CountVectorizer, or if the
    vector prior does not have exactly one entry per vocabulary item.
    '''
    # Any 0-dimensional value (Python int/float, numpy scalar) is a uniform prior.
    uniform_prior = np.ndim(prior) == 0
    if cv is None and not uniform_prior:
        # Raise instead of quit(): quit() raises SystemExit and can take the
        # whole interpreter/kernel down with it.
        raise ValueError("If using a non-uniform prior, please also pass a count "
                         "vectorizer with the vocabulary parameter set.")
    if cv is None:
        cv = CV(decode_error = 'ignore', min_df = 10, max_df = .5, ngram_range=(1,ngram),
                binary = False,
                max_features = 15000)
    # Keep the document-term matrix in whatever (normally sparse) form the
    # vectorizer returns; only the per-corpus column sums are needed.
    counts_mat = cv.fit_transform(l1+l2)
    vocab_size = len(cv.vocabulary_)
    print("Vocab size is {}".format(vocab_size))
    if uniform_prior:
        priors = np.full(vocab_size, float(prior))
    else:
        priors = np.asarray(prior, dtype=np.float64)
        if priors.shape != (vocab_size,):
            raise ValueError("prior must have one entry per vocabulary item "
                             "({} expected, shape {} given)".format(vocab_size, priors.shape))
    # Rows [0, len(l1)) belong to l1, the rest to l2. Sum each group per term;
    # float64 keeps the totals exact for large corpora.
    split = len(l1)
    counts_1 = np.asarray(counts_mat[:split].sum(axis=0), dtype=np.float64).ravel()
    counts_2 = np.asarray(counts_mat[split:].sum(axis=0), dtype=np.float64).ravel()
    a0 = np.sum(priors)
    n1 = np.sum(counts_1)
    n2 = np.sum(counts_2)
    print("Comparing language...")
    # Log-odds of each term within its own corpus, on prior-smoothed counts.
    term1 = np.log((counts_1 + priors)/(n1 + a0 - counts_1 - priors))
    term2 = np.log((counts_2 + priors)/(n2 + a0 - counts_2 - priors))
    delta = term1 - term2
    # Variance estimate of delta.
    var = 1./(counts_1 + priors) + 1./(counts_2 + priors)
    z_scores = delta/np.sqrt(var)
    index_to_term = {idx: term for term, idx in cv.vocabulary_.items()}
    return [(index_to_term[i], z_scores[i]) for i in np.argsort(z_scores)]

# Score the vocabulary of the two cleaned title sets and save the
# (word, z-score) table to an Excel workbook.
ranked_terms = bayes_compare_language(v, inv)
results = pd.DataFrame(data=ranked_terms, columns=["Word", "Score"])
results.to_excel(".//results.xlsx")

Vocab size is 4799
Comparing language...
