# Data Integration over price & volume data and fundamental data

Below I import the price and volume data as well as the fundamental data from the existing datasets. I then:
* Drop text data
* Merge the fundamental data to price and volume data, so that every day we should expect data for price, volume and fundamental data
* Impute missing data in the merged daily fundamental data as follows:
    * If the current day's data is missing, impute this day's data with the most recent valid data
    * Impute missing data on the days before a ticker's first record with 0 (after all, we truly cannot see any data at that time); statistics show that such zeros make up a very trivial proportion of all the data
* This imputation strategy can be justified as follows: while we acknowledge that fundamental data cannot be expected to be updated publicly on a daily basis, it is still a great resource for daily investment decisions, so we still need to look at it every day. Thus, if no updated fundamental data is available on a given day, we can only refer to the most recent fundamental data as the current status of the given ticker.

In [1]:
import pandas as pd

# Read the fundamentals CSV, using its first column as the row index.
fundamental_csv = "fundamental.csv"
f = pd.read_csv(fundamental_csv, index_col=0)
f

Unnamed: 0,currency_symbol,totalAssets,intangibleAssets,earningAssets,otherCurrentAssets,totalLiab,totalStockholderEquity,deferredLongTermLiab,otherCurrentLiab,commonStock,...,netIncomeFromContinuingOps,netIncomeApplicableToCommonShares,preferredStockAndOtherAdjustments,beforeAfterMarket,currency,epsActual,epsEstimate,epsDifference,surprisePercent,Ticker
1985-09-30,USD,,,,,,,,,,...,,,,,,,,,,MMM
1985-12-31,USD,,,,,,,,,,...,,,,,,,,,,MMM
1986-03-31,USD,,,,,,,,,,...,,,,,,,,,,MMM
1986-06-30,USD,,,,,,,,,,...,,,,,,,,,,MMM
1986-09-30,USD,,,,,,,,,,...,,,,,,,,,,MMM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-06-30,USD,1.377000e+10,1.390000e+09,,507000000.0,9.190000e+09,4.580000e+09,274000000.0,1.269000e+09,5000000.0,...,529000000.0,529000000.0,,BeforeMarket,USD,1.2,1.22,-0.02,-1.6393,ZTS
2022-09-30,,,,,,,,,,,...,,,,BeforeMarket,USD,,1.25,,,ZTS
2022-12-31,,,,,,,,,,,...,,,,AfterMarket,USD,,,,,ZTS
2023-03-31,,,,,,,,,,,...,,,,BeforeMarket,USD,,,,,ZTS


In [2]:
# Drop every non-numeric (text) column from the fundamentals table.
# The last column (Ticker) is skipped on purpose: it is text, but it is the
# key used later to join fundamentals onto the price/volume data.
cols = f.columns

# Use pandas' dtype predicates instead of `dtype == float` / `dtype == int`:
# those comparisons only match float64 and the platform default int, so a
# column stored as int32, float32 or a nullable numeric dtype would be
# discarded as if it were text. Booleans are still treated as non-numeric.
non_numeric_cols = [
    col
    for col in cols[:-1]
    if pd.api.types.is_bool_dtype(f[col].dtype)
    or not pd.api.types.is_numeric_dtype(f[col].dtype)
]

for col in non_numeric_cols:
    print("ignore " + col)

# Single drop instead of one full-frame copy per removed column.
f = f.drop(columns=non_numeric_cols)

ignore currency_symbol
ignore beforeAfterMarket
ignore currency


In [3]:
# Load the daily price/volume data.
d = pd.read_csv("data.csv")

# Turn the fundamentals' index into a regular 'Date' column so it can be used
# as a join key. The guard keeps this cell idempotent: without it, re-running
# the cell would reset the index a second time and create a duplicate 'Date'
# column, breaking the merge below.
if 'Date' not in f.columns:
    f = f.rename_axis('Date').reset_index()

# Left join: keep every price/volume row and attach the fundamentals reported
# for the same (Date, Ticker); rows without a match get NaN fundamentals.
# NOTE(review): this is an exact-date match, so a fundamentals row dated on a
# day that is absent from data.csv (e.g. a quarter end falling on a weekend)
# never matches and is silently dropped -- confirm this is acceptable, or
# consider an as-of join instead.
m = pd.merge(d, f, on=['Date', 'Ticker'], how='left')
m

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Adj Close,...,costOfRevenue,totalOtherIncomeExpenseNet,discontinuedOperations,netIncomeFromContinuingOps,netIncomeApplicableToCommonShares,preferredStockAndOtherAdjustments,epsActual,epsEstimate,epsDifference,surprisePercent
0,1962-01-02,0.000000,0.771044,0.748367,0.754036,212800.0,0.0,0.0,MMM,,...,,,,,,,,,,
1,1962-01-03,0.000000,0.759705,0.741280,0.759705,422400.0,0.0,0.0,MMM,,...,,,,,,,,,,
2,1962-01-04,0.000000,0.772462,0.759705,0.759705,212800.0,0.0,0.0,MMM,,...,,,,,,,,,,
3,1962-01-05,0.000000,0.756871,0.737028,0.739863,315200.0,0.0,0.0,MMM,,...,,,,,,,,,,
4,1962-01-08,0.000000,0.741280,0.731358,0.735610,334400.0,0.0,0.0,MMM,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4107858,2022-09-30,150.419998,152.029999,148.039993,148.289993,2437200.0,0.0,0.0,ZTS,,...,,,,,,,,1.25,,
4107859,2022-10-03,149.279999,152.850006,148.309998,151.350006,1580200.0,0.0,0.0,ZTS,,...,,,,,,,,,,
4107860,2022-10-04,153.240005,155.309998,152.600006,154.750000,1676300.0,0.0,0.0,ZTS,,...,,,,,,,,,,
4107861,2022-10-05,152.740005,155.630005,152.000000,154.589996,1260200.0,0.0,0.0,ZTS,,...,,,,,,,,,,


In [4]:
# Impute missing values in the merged frame:
#   1. zero-fill the first row of every ticker, then
#   2. forward-fill all remaining gaps with the most recent valid value.

# Helper frame pairing each row's ticker with its row label. Built with
# .assign() so a new frame is created instead of writing into a slice of `m`
# (which raised SettingWithCopyWarning).
t = m[['Ticker']].assign(index=m.index)

# Smallest row label per ticker, i.e. each ticker's first row in `m`.
first_record_index = t.groupby('Ticker')['index'].min().values

# Seed every ticker's first row with 0 wherever it is missing.
# NOTE(review): this fills *every* column of those rows, not only the
# fundamental ones -- a price column that is entirely NaN (such as
# 'Adj Close' in the displayed rows) ends up as 0 on every row after the
# forward fill.
m.loc[first_record_index, :] = m.loc[first_record_index, :].fillna(0)

# Carry the most recent valid value forward. Because each ticker's first row
# no longer contains NaN, values cannot leak from one ticker into the next --
# assuming each ticker's rows are contiguous and in date order (TODO confirm).
# DataFrame.ffill() replaces the deprecated fillna(method="ffill").
m = m.ffill()
m

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t['index'] = m.index


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Adj Close,...,costOfRevenue,totalOtherIncomeExpenseNet,discontinuedOperations,netIncomeFromContinuingOps,netIncomeApplicableToCommonShares,preferredStockAndOtherAdjustments,epsActual,epsEstimate,epsDifference,surprisePercent
0,1962-01-02,0.000000,0.771044,0.748367,0.754036,212800.0,0.0,0.0,MMM,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0000
1,1962-01-03,0.000000,0.759705,0.741280,0.759705,422400.0,0.0,0.0,MMM,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0000
2,1962-01-04,0.000000,0.772462,0.759705,0.759705,212800.0,0.0,0.0,MMM,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0000
3,1962-01-05,0.000000,0.756871,0.737028,0.739863,315200.0,0.0,0.0,MMM,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0000
4,1962-01-08,0.000000,0.741280,0.731358,0.735610,334400.0,0.0,0.0,MMM,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4107858,2022-09-30,150.419998,152.029999,148.039993,148.289993,2437200.0,0.0,0.0,ZTS,0.0,...,625000000.0,-56000000.0,0.0,529000000.0,529000000.0,0.0,1.2,1.25,-0.02,-1.6393
4107859,2022-10-03,149.279999,152.850006,148.309998,151.350006,1580200.0,0.0,0.0,ZTS,0.0,...,625000000.0,-56000000.0,0.0,529000000.0,529000000.0,0.0,1.2,1.25,-0.02,-1.6393
4107860,2022-10-04,153.240005,155.309998,152.600006,154.750000,1676300.0,0.0,0.0,ZTS,0.0,...,625000000.0,-56000000.0,0.0,529000000.0,529000000.0,0.0,1.2,1.25,-0.02,-1.6393
4107861,2022-10-05,152.740005,155.630005,152.000000,154.589996,1260200.0,0.0,0.0,ZTS,0.0,...,625000000.0,-56000000.0,0.0,529000000.0,529000000.0,0.0,1.2,1.25,-0.02,-1.6393
