In [1]:
import pandas as pd
import numpy as np

In [2]:
# Import the data, which is a stata file.
df = pd.read_stata('Braun_Franke_2022_data_full_sample.dta')
#Keep only relevant columns. These are: ID year ind_popshare_occ agr_popshare_occ trade_popshare_occ tax_income_base_priv_pc d_steamengine pop_log
df = df[['ID', 'name' ,'year', 'ind_popshare_occ', 'agr_popshare_occ', 'trade_popshare_occ', 'tax_income_base_priv_pc', 'd_steamengine', 'pop_log']]

In [3]:
#Import the conversion matrix.
conversion_matrix = pd.read_csv('Raw/mmborders_python.csv')

In [4]:
#Show NaN ID's.
df[df['ID'].isnull()]

Unnamed: 0,ID,name,year,ind_popshare_occ,agr_popshare_occ,trade_popshare_occ,tax_income_base_priv_pc,d_steamengine,pop_log
44592,,,1875,,,,,,
44593,,,1880,,,,,,
44594,,,1885,,,,,,
44595,,,1890,,,,,,
44596,,,1895,,,,,,
44597,,,1900,,,,,,
44598,,,1905,,,,,,
44599,,,1910,,,,,,


In [5]:
#Drop NaN ID's.
df = df.dropna(subset=['ID'])

In [6]:
#Convert NaN's in d_steamengine to 0.
df['d_steamengine'] = df['d_steamengine'].fillna(0)
#Convert the d_steamengine to an integer.
df['d_steamengine'] = df['d_steamengine'].astype(int)
#Have ID as integer.
df['ID'] = df['ID'].astype(int)
#Have id1 and id2 in conversion_matrix as an integer.
conversion_matrix['id1'] = conversion_matrix['id1'].astype(int)
conversion_matrix['id2'] = conversion_matrix['id2'].astype(int)

In [7]:
#We want to replace the d_steamengine with 1 in year 1905, if the same value in year 1867 is 1 for each ID.
#For i in unique ID's:
for i in df['ID'].unique():
    #If the value in year 1867 is 1:
    if df.loc[(df['ID'] == i) & (df['year'] == 1867), 'd_steamengine'].iloc[0] == 1:
        #Replace the value in year 1905 with 1.
        df.loc[(df['ID'] == i) & (df['year'] == 1905), 'd_steamengine'] = 1

In [8]:
#Save as a stata file.
df.to_stata('Dataset_Edited_With_Loop.dta')

In [9]:
#Use the correct ID's from the matrix.
for i in range(0,len(conversion_matrix)):
    df.loc[df["ID"] == conversion_matrix.iloc[i,0], "ID"] = conversion_matrix.iloc[i,1]
    #Groupby ID, summing only d_steamengine
    df = df.groupby(["ID", "year"]).agg({"d_steamengine": "sum", "ind_popshare_occ": "first", "agr_popshare_occ": "first", "trade_popshare_occ": "first", "tax_income_base_priv_pc": "first", "pop_log": "first"}).reset_index()

In [10]:
#Change 2 in d_steamengine to 1.
df['d_steamengine'] = df['d_steamengine'].replace(2, 1)

In [11]:
#Save as a stata file.
df.to_stata('Dataset_Edited_With_Loop_v2.dta')

In [2]:
#Start Stata.
import stata_setup
stata_setup.config("C:/Program Files/Stata17", "se")
from pystata import stata, config
config.init('se')


  ___  ____  ____  ____  ____ ®
 /__    /   ____/   /   ____/      17.0
___/   /   /___/   /   /___/       SE—Standard Edition

 Statistics and Data Science       Copyright 1985-2021 StataCorp LLC
                                   StataCorp
                                   4905 Lakeway Drive
                                   College Station, Texas 77845 USA
                                   800-STATA-PC        https://www.stata.com
                                   979-696-4600        stata@stata.com

Stata license: 300-user network, expiring 30 Nov 2022
Serial number: 401709314836
  Licensed to: Timur Öztürk
               University of Bayreuth

Notes:
      1. Unicode is supported; see help unicode_advice.
      2. Maximum number of variables is set to 5,000; see help set_maxvar.


In [3]:
%%stata 
use Dataset_Edited_With_Loop_v2.dta, clear


In [4]:
%stata sum


    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
       index |     42,336     21167.5    12221.49          0      42335
          ID |     42,336    27198.38    11026.36      10101      41627
        year |     42,336    1870.083    27.97192       1821       1925
d_steameng~e |     42,336    .0085743    .0986369          0          5
ind_popsha~c |      5,291    9.420523     6.35755          0   44.03783
-------------+---------------------------------------------------------
agr_popsha~c |      3,527    34.18495    12.10751   .9994304   67.98337
trade_pops~c |      5,291    1.518217    1.732759          0   44.80796
tax_income~c |      1,763    323.2654      99.807   82.02615   995.6055
     pop_log |     42,292    6.489994    .7460759          0   10.71945


Exception in thread Stata:
Traceback (most recent call last):
  File "c:\Users\bt307300\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1009, in _bootstrap_inner
    self.run()
  File "C:\Program Files/Stata17\utilities\pystata\core\stout.py", line 169, in run
    raise SystemError(output)
SystemError: command browse is unrecognized
r(199);

