In [1]:
import numpy as np
import pandas as pd
from rat import ratmath
from rat import ratrestore
from rat import ratedit

# Extract some Data

In [2]:
# Load the per-district employment counts (one column per sector)
# and preview the first rows.
df_a = pd.read_csv(filepath_or_buffer='tables/employment.csv')
df_a.head(n=5)

Unnamed: 0,District,Agriculture,Industry,Production,Service
0,East Forests,2063,3644,504,5562
1,North Mountains,1258,3807,862,11540
2,South Deserts,677,4051,2699,6029
3,West Steppes,3309,66,275,11996


In [3]:
# Load the per-district counts by settlement type (Urban / Suburban / Rural)
# and preview the first rows.
df_b = pd.read_csv(filepath_or_buffer='tables/environment.csv')
df_b.head(n=5)

Unnamed: 0,District,Urban,Suburban,Rural
0,East Forests,3866,1510,6397
1,North Mountains,3438,5779,8250
2,South Deserts,6971,4941,1544
3,West Steppes,3169,7188,5289


In [4]:
# Load the family-status counts, keyed by District and Sex,
# and preview the first rows.
df_c = pd.read_csv(filepath_or_buffer='tables/family.csv')
df_c.head(n=5)

Unnamed: 0,District,Sex,Single,Marriged,Widower
0,East Forests,Female,2545,2248,314
1,East Forests,Male,1702,2920,2044
2,North Mountains,Female,3059,3352,617
3,North Mountains,Male,3207,4578,2654
4,South Deserts,Female,3197,2526,1516


# ratmath

In [5]:
# Extract the value columns of the employment table as a 2-D array, one row
# per district; 'District' is passed as the label column via name_cols.
# NOTE(review): exact handling of name_cols lives in ratmath — confirm there.
lines_a = ratmath.get_lines(df_a, name_cols=['District'])
lines_a

array([[ 2063,  3644,   504,  5562],
       [ 1258,  3807,   862, 11540],
       [  677,  4051,  2699,  6029],
       [ 3309,    66,   275, 11996]])

In [6]:
# Same extraction for the environment table.
# NOTE(review): in the recorded output the columns come back as
# Rural, Suburban, Urban (alphabetical), not in the CSV order
# Urban, Suburban, Rural — presumably get_lines sorts the value columns; confirm.
lines_b = ratmath.get_lines(df_b, name_cols=['District'])
lines_b

array([[6397, 1510, 3866],
       [8250, 5779, 3438],
       [1544, 4941, 6971],
       [5289, 7188, 3169]])

In [7]:
# Use the first row of each array (the first district) as a worked example.
line_a = lines_a[0]
line_b = lines_b[0]

In [8]:
# Build the linear system that ties the unknown cross-table cells to the two
# sets of marginal totals. In the recorded output the result is a pair:
#   * a 0/1 matrix with 4 + 3 = 7 rows (one per total) and 4 * 3 = 12 columns
#     (one per employment x environment cell);
#   * the right-hand side, i.e. line_a followed by line_b.
ratmath.get_system(line_a, line_b)

(array([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
        [1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0],
        [0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0],
        [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]]),
 array([2063, 3644,  504, 5562, 6397, 1510, 3866]))

In [9]:
# Build the optimisation problem for the same pair of rows. The recorded
# output is a (problem, variable) pair: a minimisation with one equality
# constraint and a non-negative variable of length 12.
# NOTE(review): the equality constraint has 6 rows whereas get_system yields 7;
# presumably one redundant row is dropped — confirm in ratmath.
ratmath.get_problem(line_a, line_b)

(Problem(Minimize(Expression(CONVEX, NONNEGATIVE, ())), [Equality(Expression(AFFINE, NONNEGATIVE, (6,)), Constant(CONSTANT, NONNEGATIVE, (6,)))]),
 Variable((12,), nonneg=True))

In [10]:
# Solve the problem for this single district. With print_status=True the call
# prints the solver status, the optimal value and the solution, and the cell
# then displays the returned solution vector (12 cell estimates).
# NOTE(review): the recorded run reports `optimal_inaccurate`, so the
# estimates only approximately reproduce the marginal totals.
ratmath.restore_line(line_a, line_b, print_status=True)

Status:  optimal_inaccurate
The optimal value is 23820305.91149172
A solution x is
[1268.10779075  244.74081494  611.21857335 2161.59040233  129.78623945
 1504.70118493    9.24416831  380.63035393   57.44916523 2581.90589071
  485.05031314 2338.57510293] 





array([1268.10779075,  244.74081494,  611.21857335, 2161.59040233,
        129.78623945, 1504.70118493,    9.24416831,  380.63035393,
         57.44916523, 2581.90589071,  485.05031314, 2338.57510293])

# ratrestore

In [11]:
# Split the columns of the two tables into the shared label column(s) and the
# value columns specific to each table. In the recorded output: 'District' is
# the only shared column, and the value columns are returned in sorted order.
name_cols, targets_a, targets_b = ratrestore.get_names_and_targets(df_a, df_b)
name_cols, targets_a, targets_b

(array(['District'], dtype=object),
 array(['Agriculture', 'Industry', 'Production', 'Service'], dtype=object),
 array(['Rural', 'Suburban', 'Urban'], dtype=object))

In [12]:
# Produce the labels for the 12 cross-table cells. In the recorded output each
# name from targets_a is repeated once per name in targets_b, while targets_b
# is cycled — the same ordering as the rows of the restored table below.
ratrestore.get_rows_target_names(targets_a, targets_b)

(array(['Agriculture', 'Agriculture', 'Agriculture', 'Industry',
        'Industry', 'Industry', 'Production', 'Production', 'Production',
        'Service', 'Service', 'Service'], dtype=object),
 array(['Rural', 'Suburban', 'Urban', 'Rural', 'Suburban', 'Urban',
        'Rural', 'Suburban', 'Urban', 'Rural', 'Suburban', 'Urban'],
       dtype=object))

In [13]:
# Restore the full District x Employment x Environment table in long format;
# the three name_* arguments set the labels of the two category columns and
# of the estimated-count column.
df_res = ratrestore.restore_table(
    df_a,
    df_b,
    name_a='Employment',
    name_b='Environment',
    name_res='Count',
)
df_res.head(n=15)

Unnamed: 0,District,Employment,Environment,Count
0,East Forests,Agriculture,Rural,1268.107791
1,East Forests,Agriculture,Suburban,244.740815
2,East Forests,Agriculture,Urban,611.218573
3,East Forests,Industry,Rural,2161.590402
4,East Forests,Industry,Suburban,129.786239
5,East Forests,Industry,Urban,1504.701185
6,East Forests,Production,Rural,9.244168
7,East Forests,Production,Suburban,380.630354
8,East Forests,Production,Urban,57.449165
9,East Forests,Service,Rural,2581.905891


# ratedit

In [14]:
# Show the whole family table (one row per District x Sex) before reshaping it.
df_c

Unnamed: 0,District,Sex,Single,Marriged,Widower
0,East Forests,Female,2545,2248,314
1,East Forests,Male,1702,2920,2044
2,North Mountains,Female,3059,3352,617
3,North Mountains,Male,3207,4578,2654
4,South Deserts,Female,3197,2526,1516
5,South Deserts,Male,1899,1693,2625
6,West Steppes,Female,3032,2312,1988
7,West Steppes,Male,3756,3652,906


In [15]:
# Reshape wide -> long: the three status columns are folded into a
# 'Family Status' label column plus a 'Persons' count column, while
# District and Sex are kept as row labels.
df_weak = ratedit.roll_weak(
    df_c,
    ['Single', 'Marriged', 'Widower'],
    value_name='Family Status',
    res_name='Persons',
)
df_weak.head(n=12)

Unnamed: 0,District,Sex,Family Status,Persons
0,East Forests,Female,Single,2545
1,East Forests,Female,Marriged,2248
2,East Forests,Female,Widower,314
3,East Forests,Male,Single,1702
4,East Forests,Male,Marriged,2920
5,East Forests,Male,Widower,2044
6,North Mountains,Female,Single,3059
7,North Mountains,Female,Marriged,3352
8,North Mountains,Female,Widower,617
9,North Mountains,Male,Single,3207


In [16]:
# Reshape so that each district is one row and every (Sex, status) pair gets
# its own generic column (col_0 ... col_5); cols_strong is the key that maps
# those generic column names back to their Sex / Value labels.
# NOTE(review): in the recorded output the key labels col_1 as Female/Single
# and col_4 as Male/Widower, but their values (1702 and 314 for East Forests)
# match Male/Single and Female/Widower in df_c — the Sex labels look
# mis-ordered; verify in ratedit.roll_strong.
df_strong, cols_strong = ratedit.roll_strong(
    df_c,
    ['District'],
    ['Sex'],
    ['Single', 'Marriged', 'Widower'],
)
df_strong, cols_strong

(          District col_0 col_1 col_2 col_3 col_4 col_5
 0     East Forests  2545  1702  2248  2920   314  2044
 1  North Mountains  3059  3207  3352  4578   617  2654
 2    South Deserts  3197  1899  2526  1693  1516  2625
 3     West Steppes  3032  3756  2312  3652  1988   906,
           Sex     Value
 col_0  Female    Single
 col_1  Female    Single
 col_2  Female  Marriged
 col_3    Male  Marriged
 col_4    Male   Widower
 col_5    Male   Widower)

In [17]:
# Names of the value (sector) columns of df_a, reused by the cells below;
# df_a is redisplayed as the reference the noisy copy is compared against.
a_columns = ['Agriculture', 'Industry', 'Production', 'Service']
df_a

Unnamed: 0,District,Agriculture,Industry,Production,Service
0,East Forests,2063,3644,504,5562
1,North Mountains,1258,3807,862,11540
2,South Deserts,677,4051,2699,6029
3,West Steppes,3309,66,275,11996


In [18]:
# Build a deliberately inconsistent copy of the employment table by adding
# standard-normal noise to every count, so its row totals no longer match
# the true totals.
#
# The generator is seeded so the noise — and every output derived from it —
# is reproducible under Restart & Run All. The noise shape is derived from
# the table instead of being hard-coded to 4 x 4.
rng = np.random.default_rng(0)
noise = rng.normal(size=(len(df_a), len(a_columns)))

df_noncorect = df_a.copy()
# Cast to float before adding: the counts are integers and the noise is
# float, so this avoids relying on implicit upcasting during assignment.
df_noncorect[a_columns] = df_noncorect[a_columns].astype(float) + noise
df_noncorect

Unnamed: 0,District,Agriculture,Industry,Production,Service
0,East Forests,2063.442241,3644.227029,503.974902,5561.723
1,North Mountains,1256.70831,3806.303721,863.701223,11539.527866
2,South Deserts,675.912778,4051.426559,2701.291853,6028.035904
3,West Steppes,3309.595765,66.474966,274.6744,11996.029298


In [19]:
# True per-district totals, taken from the unperturbed table; these are the
# targets the corrected table must reproduce.
real_sums = df_a[a_columns].to_numpy().sum(axis=1)
real_sums

array([11773, 17467, 13456, 15646])

In [20]:
# Adjust the noisy value columns so that every row sums to its true total.
# NOTE(review): by_div / by_add presumably enable a multiplicative and an
# additive adjustment step respectively — confirm in ratedit.correct_table.
correct_table = ratedit.correct_table(
    df_noncorect,
    a_columns,
    real_sums,
    by_div=True,
    by_add=True,
)
correct_table

Unnamed: 0,District,Agriculture,Industry,Production,Service
0,East Forests,2063.37789,3644.113378,503.959184,5561.549548
1,North Mountains,1256.762912,3806.469099,863.73875,11540.02924
2,South Deserts,675.879271,4051.225716,2701.157941,6027.737073
3,West Steppes,3309.431959,66.471676,274.660805,11995.435561


In [21]:
# Check that each corrected row reproduces its true total. The corrected
# values are floats, so compare with a small absolute tolerance: exact `==`
# can report False purely because of floating-point rounding in the sum.
np.isclose(correct_table[a_columns].values.sum(axis=1), real_sums, rtol=0, atol=1e-6)

array([ True,  True,  True,  True])