# Notebook to process manually downloaded data from GKS
This notebook transforms data that has already been downloaded manually from https://showdata.gks.ru/olap2/descr/report/277326/ (GKS). Initially the dataframe has 2 columns: the date (a Cyrillic string such as "январь 2016 г.") and the value (the YoY consumer price index for all goods and services in Russia; other series can be selected on the site, e.g. MoM indices, specific goods and services, or a single region instead of the whole of Russia). The output is a CSV file with the dates and values in a standard format.

In [44]:
import os
import pandas as pd

In [54]:
# Find the downloaded GKS export (any CSV other than this notebook's own
# input/output files) and give it the fixed name that the loading cell reads.
directory = os.getcwd()

# os.listdir returns entries in arbitrary order; sort them so the selected file
# is reproducible when several candidate CSV files are present.
files = sorted(os.listdir(directory))

for file in files:
    if file.endswith('.csv') and file != 'data_raw.csv' and file != 'data.csv':
        old_file_name = os.path.join(directory, file)
        new_file_name = os.path.join(directory, 'data_raw.csv')

        if os.path.exists(old_file_name):
            # os.replace overwrites an existing destination on every platform,
            # so a separate remove step is not needed.
            os.replace(old_file_name, new_file_name)
            # Report the rename only when it actually happened.
            print(f'Renamed "{file}" to "data_raw.csv"')

        # Only the first matching CSV is processed.
        break

os.listdir(os.getcwd())

Renamed "11521100300010200001_Индексы_потребительских_цен_на_товары_и_услуги.csv" to "data_raw.csv"


['data_download.ipynb', 'data_process.ipynb', 'data_raw.csv']

In [55]:
# Read the renamed raw export from the working directory, decoding it as cp1251.
file_path = os.path.join(
    os.getcwd(),
    "data_raw.csv",
)
df = pd.read_csv(filepath_or_buffer=file_path, encoding='cp1251')

# Show the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,11521100300010200001 Индексы потребительских цен на товары и услуги
0,январь 2016 г.,10977
1,февраль 2016 г.,10806
2,март 2016 г.,10727
3,апрель 2016 г.,10725
4,май 2016 г.,10731


In [56]:
from datetime import datetime

# Russian month names (nominative, lower case) mapped to two-digit month numbers.
month_map = {
    'январь': '01',
    'февраль': '02',
    'март': '03',
    'апрель': '04',
    'май': '05',
    'июнь': '06',
    'июль': '07',
    'август': '08',
    'сентябрь': '09',
    'октябрь': '10',
    'ноябрь': '11',
    'декабрь': '12'
}


def convert_date(cyrillic_date):
    """Convert a date string such as 'январь 2016 г.' to the first day of that month.

    Args:
        cyrillic_date: String whose first whitespace-separated token is a
            Russian month name (matched case-insensitively against
            ``month_map``) and whose second token is a four-digit year.
            Any further tokens (e.g. 'г.') are ignored. A value that is
            already a ``pd.Timestamp`` is returned unchanged.

    Returns:
        pd.Timestamp for day 1 of the given month and year.

    Raises:
        KeyError: If the month token is not a key of ``month_map``.
        IndexError: If the string has fewer than two tokens.
        ValueError: If the year token does not parse with the '%Y' format.
    """
    # Pass already-converted values through so re-running the conversion on a
    # processed column does not fail.
    if isinstance(cyrillic_date, pd.Timestamp):
        return cyrillic_date

    # Lower-case before the lookup so capitalised month names are accepted too.
    parts = cyrillic_date.lower().split()
    month = month_map[parts[0]]
    year = parts[1]
    return pd.to_datetime(f'01.{month}.{year}', format='%d.%m.%Y')

# Give the two columns stable names first, then work by label rather than by
# position.
date_col, value_col = df.columns[0], df.columns[1]
df = df.rename(columns={date_col: 'date', value_col: 'CPI'})

# Assign by column label: `df.iloc[:, 0] = ...` sets values in place on
# pandas >= 2.0, which can leave the column as object dtype instead of
# datetime64. pd.to_datetime makes the resulting dtype explicit.
df['date'] = pd.to_datetime(df['date'].apply(convert_date))

# The raw values use a decimal comma. Casting to str first keeps this step
# working when the column is already numeric (e.g. when the cell is re-run).
# Values that still cannot be parsed become NaN because of errors='coerce'.
df['CPI'] = pd.to_numeric(
    df['CPI'].astype(str).str.replace(',', '.', regex=False),
    errors='coerce',
)

df.head()

Unnamed: 0,date,CPI
0,2016-01-01,109.77
1,2016-02-01,108.06
2,2016-03-01,107.27
3,2016-04-01,107.25
4,2016-05-01,107.31


In [58]:
# Write the processed frame to data.csv without the row index.
df.to_csv(path_or_buf='data.csv', index=False)