In [22]:
import sys
import re
import os
import random
from math import log, ceil
import numpy as np
import zlib
import bz2

In [23]:


log2p = lambda x: int(ceil(log(x,2)))
MAXPATTERNSIZE = 64		# max length of duplicates

def BackExplore(Str, split, Pattern):
	""" Looks for Pattern before split point, as long as coding for duplication is advantageous
		If found, Pattern is replaced by 
			<relative position in History> <pattern length>
		where position and length are coded in standard binary representation
	"""
	PatternLength = len(Pattern)
	# constraint: log(1+<relative position>) + log(1+PatternLength) < PatternLength
	MaxPast = 2 ** (PatternLength - log2p(1+PatternLength)) - 1		# MaxPast > 0 as soon as PL >= 4
	if MaxPast > 0:
		Best = ' ' * PatternLength	# will store best code for duplicate od Pattern
		for shift in range(min(split, MaxPast)):
			if Str[split-shift-PatternLength:split-shift] == Pattern:
				Code = "%s %s" % (bin(shift)[2:], bin(PatternLength)[2:])
				if len(Code) < len(Best):	Best = Code
		if Best.strip(): return Best
	return None

def Encode(Str):
	" detects duplicates and encodes the string accordingly "
	
	Best = (0, 0, ' ' * MAXPATTERNSIZE)	# will store best duplicate (for the code length difference)
	for split in range(len(Str)):	# splitting the string at all successive positions
		for PatternLength in range(MAXPATTERNSIZE, 2, -1):
			Pattern = Str[split:split+PatternLength]
			ActualPL = len(Pattern)	# may differ from PatternLength
			BE = BackExplore(Str, split, Pattern)
			if BE:
				Gain = ActualPL - len(BE)
				if Gain > 1 and Gain > Best[1] - len(Best[2]):
					Best = (split, ActualPL, BE)
	splt, pl, be = Best
	if be.strip():		# duplicate found
		# recursive calls
		Start = Encode(Str[:splt])
		End = Encode(Str[splt+pl:])
		print('pattern %s of length %d is duplicated at location %d' % (Str[splt:splt+pl], pl, splt))
		return ('%s 1 %s %s' % (Start, be, End)).strip()	# the encoded part is prefixed by 1
	# return '0 ' + Str if Str else ''
	return Str

def Decode(EStr):
	" decodes a string in which duplicates have been encoded "
	Chunks = re.split(r'( 1 \d+ \d+)', EStr)	# splitting code chunks, either <NormalStr> or <1 distanceToPattern patternLength>
	Chunks = map(lambda x: x.strip(), Chunks)	# getting rid of surrounding spaces
	Str = ''
	for Ch in list(Chunks):
		if not Ch.startswith('1 '):	Str += Ch	# normal string
		else:	# expected: <1 distanceToPattern patternLength>
			shift = int(Ch.split()[1], 2)
			patternLength = int(Ch.split()[2], 2)
			SL = len(Str)
			Str += Str[SL-shift-patternLength:SL-shift]
	return Str
	
if __name__ == "__main__":	
	print(__doc__)
	if len(sys.argv) > 1:	TestStr = ''.join(sys.argv[1:])	
	else:
		# TestStr = "001010000111110100110100001111101010010001"
		# TestStr = "001010000111110100110100001111101010010000111000011"
		# TestStr = "0101010101010101010101010101010101"
		# TestStr = "0001010110111001101000010111110011011100110110010001011100110001"
		# TestStr = "1100110100110110111110100001000101011101011101101111101111101111100001100110000111100001100110111000"
		Pi = '1001001000011111101101010100010001000010110100011000010001101001100010011000110011000101000101110000000110111'
		# TestStr = Pi
		TestStr = ''.join([str(random.randint(0,1)) for n in range(100) ])	# random sequence
		print('Usage: %s <binary string with duplicates>\n' % os.path.basename(sys.argv[0]))
	ETestStr = Encode(TestStr)
	print('Original:\t%s - length: %d' % (TestStr, len(TestStr)))
	print('Encoded:\t%s - length: %d' % (ETestStr, len(ETestStr.replace(' ',''))))
	DTestStr = Decode(ETestStr)
	print('Decoded:\t%s - %s' % (DTestStr, 'Correct' if DTestStr == TestStr else 'Incorrect'))

__author__ = 'Dessalles'

Automatically created module for IPython interactive environment
Original:	-f/Users/Marie/Library/Jupyter/runtime/kernel-2478fd2b-eaa9-4449-98ac-f4bc15ee1dac.json - length: 87
Encoded:	-f/Users/Marie/Library/Jupyter/runtime/kernel-2478fd2b-eaa9-4449-98ac-f4bc15ee1dac.json - length: 87
Decoded:	-f/Users/Marie/Library/Jupyter/runtime/kernel-2478fd2b-eaa9-4449-98ac-f4bc15ee1dac.json - Correct


In [24]:
l = Encode('1110011010100111000110011100001101110100100011000101100000100011110000101100111011010011000001100011')

pattern 0011000 of length 7 is duplicated at location 62
pattern 100111000 of length 9 is duplicated at location 20


In [25]:
import zlib

TRIALS = 5000	# number of trials to get average

#*******************************************#
# Biased sequence generation                #
#*******************************************#

def BitGen(Probability1):
	" biased coin flipping "
	return 1 * (random.random() < Probability1)
	
def SeqGen(Length, Probability1):
	" generates a biased random binary string "
	return ''.join([str(BitGen(Probability1)) for n in range(Length)])

def SeqCompress(Sequence):
	if type(Sequence) == str:
		Sequence = Sequence.encode('latin-1')	# 'latin-1' =   8 bits ASCII
	CompressedSize = len(zlib.compress(Sequence))
	CompressingFactor = 100-(100*CompressedSize)/len(Sequence)
	return CompressingFactor

	
if __name__ == "__main__":	
	if len(sys.argv) == 3 and sys.argv[1].isdigit() and sys.argv[2].isdigit():
		SequenceLength = int(sys.argv[1])
		Probability = int(sys.argv[2])
		# Since sequence is coded as an ascii sting, even a random sequence get compressed
		# One must subtract the compression factor of a random sequence
		S1 = SeqGen(int(sys.argv[1]), 1)	# constant sequence of same length
		# Compression factors:
		CF1 = SeqCompress(S1)
		CF = 0
		for n in range(TRIALS):	# averaging over several trials
			S0 = SeqGen(int(sys.argv[1]), 0.5)	# unbiased random sequence of same length
			CF0 = SeqCompress(S0)
			S = SeqGen(SequenceLength, Probability/100.0)
			if n < 10:	print('%s %s' % (S[:40], '...' * (SequenceLength > 40)))
			if n == 10:	print('...')
			CF += SeqCompress(S)
		CF /= TRIALS	# average
		print('Compression = %.01f%%' % (100 * (CF - CF0)/(CF1 - CF0)))
	else:
		print("\tUsage: %s <sequence length> <probability of 1s in %%>" % os.path.basename(sys.argv[0]))
		print(__doc__)
		
		

	Usage: ipykernel_launcher.py <sequence length> <probability of 1s in %>
Automatically created module for IPython interactive environment


In [34]:
s = SeqGen(1000,0.22)
s =b'1'*1000
s

b'11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111

In [35]:
s1 = Encode(s)

pattern b'1111111' of length 7 is duplicated at location 7
pattern b'111111111111111' of length 15 is duplicated at location 15
pattern b'1111111111111111111111111111111' of length 31 is duplicated at location 31
pattern b'1111111' of length 7 is duplicated at location 7
pattern b'111111111111111' of length 15 is duplicated at location 15
pattern b'1111111111111111111111111111111' of length 31 is duplicated at location 31
pattern b'1111111' of length 7 is duplicated at location 7
pattern b'111111111111111' of length 15 is duplicated at location 15
pattern b'1111111111111111111111111111111' of length 31 is duplicated at location 31
pattern b'1111111' of length 7 is duplicated at location 7
pattern b'111111111111111' of length 15 is duplicated at location 15
pattern b'1111111111111111111111111111111' of length 31 is duplicated at location 31
pattern b'1111111' of length 7 is duplicated at location 7
pattern b'111111111111111' of length 15 is duplicated at location 15
pattern b'1111111111

In [36]:
len(s1)/len(s)

0.514

In [None]:
Xabs = np.linspace(0, 0.5,10)
s2 = Encode(SeqGen(1000,Xabs))
plt.plot(Xabs, s2)

In [11]:

#******************************#
# choosing compression method  #
#******************************#
compressor = zlib.compress		# compressor that is used in compression operations
# compressor = bz2.compress		# compressor that is used in compression operations

Bits = ['0', '1']
Bytes = [chr(i) for i in range(256)]		# 256 first chars


#******************************#
# base conversions             #
#******************************#
def BaseChange(Number, BaseSymbols=Bytes):
	""" converts a number from base 10 to another base 
	"""
	Base = len(BaseSymbols)
	result = ''
	Number = int(Number)	# to avoid errors if input is text
	while Number:
		result = BaseSymbols[Number % (Base)] + result	# low-weight digits processed first
		Number //= Base
	return result	# returned as a string

def Base4To16(DNA, Nucleotides='ATGC'):
	"""	converts DNA string with A,T,G,C into base 256
	"""
	Base = list(Nucleotides)
	ByteSequence = ''
	for locus in range(0, len(DNA), 4):
		Hexa = Base.index(DNA[locus])
		Hexa += Base.index(DNA[locus+1])*4
		Hexa += Base.index(DNA[locus+2])*16
		Hexa += Base.index(DNA[locus+3])*64
		ByteSequence += Bytes[Hexa]
	return HexaSequence

	
	
#******************************#
# Compressing sequences        #
#******************************#
	
def Champernowne(N, base=10):
	" computes the Champernowne constant up to N in various bases "
	Champ = ''
	easybases = {2:'b', 8:'o', 10:'', 16:'x'}
	if base in easybases:
		for i in range(N): Champ += ('{0:%s}' % easybases[base]).format(i)
	else:	print('Unsupported base: %d' % base)
	return Champ
	
def compression(TextSample, TextName='', Encoding='latin-1', Verbose=True):
	"""	gets a string compressed and prints compression factor
	"""
	try:	
		if Verbose:	sys.stdout.write("Compressing %s ...\n" % TextSample[:40])
	except UnicodeEncodeError:	pass
	if type(TextSample) == str:
		TextSample = TextSample.encode(Encoding)	# 'latin-1' =   8 bits ASCII
	CompressedSize = len(compressor(TextSample))
	CompressingFactor = 100-(100*CompressedSize)/len(TextSample)
	if Verbose:
		print("%s: size %d reduced to --> %d (compression %.01f%%)" % (TextName, len(TextSample), 
			CompressedSize, CompressingFactor))
	return (CompressedSize, CompressingFactor)
	
  
if __name__ == "__main__":	
	
	##############################
	# Compressing pow(10,i) base 10  #
	##############################
	i = 10000
	print('\nCompressing pow(10,%d) base 10' % i)
	Num = pow(10,i)
	StrNum = str(Num)
	compression(StrNum, "power of ten")

	######################################
	# Compressing pow(256,i) (base 256)  #
	######################################
	i = 10000
	print('\nCompressing pow(256,%d) base 256 represented in bytes' % i)
	Num = pow(256,i)
	StrNum = BaseChange(Num, Bytes)	# mostly null chars
	compression(StrNum, "power of 256")


	#################################
	# Compressing a Random Number   #
	#################################
	i = 10000
	print('\nCompressing a pseudo-Random Number larger than 10^%d represented in bytes' % i)
	Num = random.randint(pow(10,i),pow(10,i+1)-1)
	# Changing base to 256 to use full string expressive power
	StrNum = BaseChange(Num, Bytes)	# converts base 10 into 256
	compression(StrNum, "random number", Encoding='latin-1')
	

	##############################
	# Compressing Pi             #
	##############################
	print('\nCompressing Pi represented in bytes')
	PIDECIMALSFILE = "Pi_decimals.txt"
	try:
		TextSample = open(PIDECIMALSFILE).read()
		TextSample = re.sub(r'[\s\.\,]', '', TextSample)	# gets rid of spaces and punctations
		##### Add relevant line here #######
		(CompSize, CompFactor) = compression(TextSample, "Pi in base 10")
		if CompFactor > 1:	print("Strange: Pi's decimals seem to be compressible !!")
	except IOError:
		print(f"To compress 'Pi', please provide a text file '{PIDECIMALSFILE}' with decimal digits of Pi")






Compressing pow(10,10000) base 10
Compressing 1000000000000000000000000000000000000000 ...
power of ten: size 10001 reduced to --> 34 (compression 99.7%)

Compressing pow(256,10000) base 256 represented in bytes
Compressing                                         ...
power of 256: size 10001 reduced to --> 32 (compression 99.7%)

Compressing a pseudo-Random Number larger than 10^10000 represented in bytes
Compressing MÛ©·· Å5!>Ê ¶ä¸(âï[M+vwkíÐpy¾'E ...
random number: size 4153 reduced to --> 4164 (compression -0.3%)

Compressing Pi represented in bytes
Compressing 3141592653589793238462643383279502884197 ...
Pi in base 10: size 10000 reduced to --> 5089 (compression 49.1%)
Strange: Pi's decimals seem to be compressible !!


In [51]:
	##############################
	# Compressing Pi             #
	##############################
	print('\nCompressing Pi represented in bytes')
	PIDECIMALSFILE = "Pi_decimals.txt"
	try:
		TextSample = open(PIDECIMALSFILE).read()
		TextSample = re.sub(r'[\s\.\,]', '', TextSample)	# gets rid of spaces and punctations
		TextSample = BaseChange(TextSample, Bytes)	# converts base 10 into 256
		(CompSize, CompFactor) = compression(TextSample, "Pi in base 256")
		if CompFactor > 1:	print("Strange: Pi's decimals seem to be compressible !!")
	except IOError:
		print(f"To compress 'Pi', please provide a text file '{PIDECIMALSFILE}' with decimal digits of Pi")





Compressing Pi represented in bytes
Compressing ¹S¡Æí2aw¹Õ.>~Ð¦Eü[§oëþ+Ý§6 ...
Pi in base 256: size 4153 reduced to --> 4164 (compression -0.3%)


In [20]:
import sys
import os
import zlib


Str = open('./kolmo.txt', 'rb').read().replace(b'\r\n', b' ').replace(b'\r', b' ').replace(b'\n', b' ')
print(len(Str))
for zipIteration in range(1237):
    print('Iteration %d: length = %d' % (zipIteration, len(Str)))
    Str = zlib.compress(Str)

24895
Iteration 0: length = 24895
Iteration 1: length = 8705
Iteration 2: length = 8716
Iteration 3: length = 8727
Iteration 4: length = 8738
Iteration 5: length = 8749
Iteration 6: length = 8760
Iteration 7: length = 8771
Iteration 8: length = 8782
Iteration 9: length = 8793
Iteration 10: length = 8804
Iteration 11: length = 8815
Iteration 12: length = 8826
Iteration 13: length = 8837
Iteration 14: length = 8848
Iteration 15: length = 8859
Iteration 16: length = 8870
Iteration 17: length = 8881
Iteration 18: length = 8892
Iteration 19: length = 8903
Iteration 20: length = 8912
Iteration 21: length = 8923
Iteration 22: length = 8934
Iteration 23: length = 8945
Iteration 24: length = 8956
Iteration 25: length = 8967
Iteration 26: length = 8978
Iteration 27: length = 8989
Iteration 28: length = 9000
Iteration 29: length = 9011
Iteration 30: length = 9022
Iteration 31: length = 9033
Iteration 32: length = 9044
Iteration 33: length = 9055
Iteration 34: length = 9066
Iteration 35: length = 

Iteration 1067: length = 21977
Iteration 1068: length = 21993
Iteration 1069: length = 22009
Iteration 1070: length = 22025
Iteration 1071: length = 22041
Iteration 1072: length = 22057
Iteration 1073: length = 22073
Iteration 1074: length = 22089
Iteration 1075: length = 22105
Iteration 1076: length = 22121
Iteration 1077: length = 22137
Iteration 1078: length = 22153
Iteration 1079: length = 22169
Iteration 1080: length = 22185
Iteration 1081: length = 22201
Iteration 1082: length = 22214
Iteration 1083: length = 22230
Iteration 1084: length = 22246
Iteration 1085: length = 22262
Iteration 1086: length = 22278
Iteration 1087: length = 22294
Iteration 1088: length = 22310
Iteration 1089: length = 22326
Iteration 1090: length = 22342
Iteration 1091: length = 22358
Iteration 1092: length = 22374
Iteration 1093: length = 22390
Iteration 1094: length = 22406
Iteration 1095: length = 22422
Iteration 1096: length = 22434
Iteration 1097: length = 22450
Iteration 1098: length = 22466
Iteratio

In [4]:
fr = 5540000000/(2*10**12)

In [38]:
np.log(1/fr)

5.88890795878289

In [39]:
# book
fb = 14240000000/(2*10**12)
np.log(1/fb)

4.944847553558253

In [40]:
# calligraphy
fc = 202000000/(2*10**12)
np.log(1/fc)

9.200390041123015

In [44]:
# echitruc
fe = 811000/(2*10**12)
np.log(1/fe)

14.718144963390943

In [43]:
un,deux,trois,quatre = 8247, 10793, 2151, 2829

In [46]:
un_deux = 18022
trois_quatre = 4352

In [54]:
NCD_12 = 1 - (un_deux - un) / deux

In [55]:
NCD_34 = 1 - (trois_quatre - trois) / quatre

In [59]:
NCD_12 - NCD_34

-0.127666174844548