In [1]:
import pandas as pd
import numpy as np 
import requests
from PIL import Image

# Show floats with two decimals. The fully-qualified key is required: the
# short pattern 'precision' is ambiguous on pandas versions that also define
# 'styler.format.precision', and an ambiguous pattern raises OptionError.
pd.set_option('display.precision', 2)

## 清洗从网页上获取的各个国家的数据

In [2]:
# Keep rendered tables short: pandas truncates anything longer than 10 rows.
pd.set_option('display.max_rows', 10)

# Read the scraped table, discard the rows at positions 0, 6, 7 and 8, and
# transpose it so that every remaining original column becomes one row.
info_df = pd.read_csv('infos.csv').drop(index=[0, 6, 7, 8]).T

# The first row after transposing is not wanted as data, so drop it by label.
info_df = info_df.drop(index=info_df.index[0])

# The five rows kept from the CSV become these five named columns.
info_df.columns = ['Actual', 'Previous', 'Highest', 'Lowest', 'Dates']
info_df

Unnamed: 0,Actual,Previous,Highest,Lowest,Dates
ARGENTINA,89.40,86.00,166.70,34.50,1997 - 2019
AUSTRIA,70.40,74.00,84.90,56.10,1988 - 2019
AUSTRALIA,45.10,41.50,45.10,9.70,1989 - 2019
BELGIUM,98.60,99.80,133.10,74.10,1980 - 2019
BRAZIL,75.79,76.53,76.53,51.27,2006 - 2019
...,...,...,...,...,...
SWEDEN,35.10,38.80,72.40,35.10,1994 - 2019
SINGAPORE,126.30,112.20,126.30,67.40,1993 - 2019
THAILAND,41.80,41.20,57.80,15.20,1996 - 2018
TAIWAN,30.90,30.80,34.00,11.19,1992 - 2018


## 处理图片

In [3]:
# Process a chart image and return the coordinates of its data points
def process_pic(image, last_year=None):
    """Locate the plotted data points of a chart image, one point per year.

    Every pixel that compares equal to ``(0, 0, 0, 255)`` is treated as black.
    Black pixels whose column index is > 670 or whose row index is > 300 are
    considered part of the axis area and ignored. Of each horizontal run of
    adjacent black pixels only the left-most one is kept, so that a year
    contributes a single point.

    The surviving points are ordered by column index, right-most first, and
    labelled with consecutive years counting down from ``last_year``.

    Parameters
    ----------
    image : object with ``width``, ``height`` and ``getpixel((x, y))``
        Typically a PIL image. Pixels must be 4-tuples for the black test to
        match; a 3-tuple (e.g. RGB mode) never equals ``(0, 0, 0, 255)``.
    last_year : int, optional
        Year assigned to the right-most point. When omitted it is parsed from
        the last four characters of ``info_df['Dates'][country]``, where
        ``info_df`` and ``country`` are notebook globals.

    Returns
    -------
    pandas.DataFrame
        Indexed by year, with two columns. NOTE: the column names are kept
        for compatibility with ``predict`` even though they are swapped
        relative to their content: ``'width'`` holds the pixel *row* index and
        ``'height'`` holds the pixel *column* index. The frame is empty when
        no data point is found.
    """
    # Pixels beyond these indices belong to the axis area, not to the data.
    axis_col_limit = 670
    axis_row_limit = 300

    # All black pixels as [row, col], scanned row by row, left to right.
    black_list = [
        [row, col]
        for row in range(image.height)
        for col in range(image.width)
        if image.getpixel((col, row)) == (0, 0, 0, 255)
    ]
    # Set copy for constant-time neighbour lookups (a list scan is quadratic).
    black_set = {(row, col) for row, col in black_list}

    data_coor = []
    for row, col in black_list:
        if col > axis_col_limit or row > axis_row_limit:
            continue  # axis area
        if (row, col - 1) in black_set:
            continue  # left neighbour is already black: same point
        data_coor.append([row, col])

    if last_year is None:
        # Backward-compatible fallback to the notebook globals.
        last_year = int(info_df['Dates'][country][-4:])

    # Naming the columns up front keeps the sort valid for an empty list.
    coor_pd = pd.DataFrame(data_coor, columns=['width', 'height'])
    coor_pd = coor_pd.sort_values('height', ascending=False)
    coor_pd.index = [last_year - i for i in range(len(coor_pd))]
    return coor_pd

In [4]:
# Estimate the ratio of every year from the pixel coordinates of its point
def predict(coor_pd, name=None, real_max=None, real_min=None):
    """Convert pixel positions to estimated values by linear interpolation.

    The smallest value in ``coor_pd['width']`` is mapped to ``real_max`` and
    the largest to ``real_min``; every other position is placed on the
    straight line ``y = a * x + b`` through those two anchors.

    Parameters
    ----------
    coor_pd : pandas.DataFrame
        Frame with a ``'width'`` column of pixel positions, as returned by
        ``process_pic``. It is modified in place: a result column is added.
    name : optional
        Label of the result column. Defaults to the notebook global
        ``country``.
    real_max, real_min : float, optional
        Values assigned to the two extreme positions. Default to
        ``info_df['Highest'][name]`` and ``info_df['Lowest'][name]`` from the
        notebook global ``info_df``.

    Returns
    -------
    pandas.DataFrame
        The same ``coor_pd`` object, with the extra column ``name``.

    Notes
    -----
    When all positions are identical (a single point or a flat line) the
    slope is taken as 0, so every estimate equals ``real_max``; dividing by
    the zero pixel span would otherwise fail or yield NaN/inf.
    """
    # Backward-compatible fallbacks to the notebook globals.
    if name is None:
        name = country
    if real_max is None:
        real_max = float(info_df['Highest'][name])
    if real_min is None:
        real_min = float(info_df['Lowest'][name])

    # The smallest position corresponds to the highest value and vice versa.
    y_coor_max = coor_pd['width'].min()
    y_coor_min = coor_pd['width'].max()

    # Solve y = a*x + b through (y_coor_max, real_max), (y_coor_min, real_min).
    span = y_coor_max - y_coor_min
    a = (real_max - real_min) / span if span != 0 else 0.0
    b = real_max - a * y_coor_max

    # Vectorised evaluation of the line for every year.
    coor_pd[name] = coor_pd['width'] * a + b
    return coor_pd


In [5]:
# Main loop: digitise the chart of every country and collect the estimates.
# `country` must remain a module-level loop variable because process_pic and
# predict read it as a global when called without explicit overrides.
ratio_columns = []
for country in info_df.index:
    path = './img/{}.jpg'.format(country)
    # The context manager closes the underlying file once the pixels are read.
    with Image.open(path) as image:
        coor_pd = process_pic(image)
    coor_pd = predict(coor_pd)
    ratio_columns.append(coor_pd[country])

# One concat at the end instead of growing the frame inside the loop; this
# also removes the need to special-case a hard-coded first country, which
# broke on a fresh kernel whenever the first row was not 'ARGENTINA'.
country_ratio = pd.concat(ratio_columns, axis=1)
country_ratio

Unnamed: 0,ARGENTINA,AUSTRIA,AUSTRALIA,BELGIUM,BRAZIL,CANADA,SWITZERLAND,CHINA,CZECH-REPUBLIC,GERMANY,...,PHILIPPINES,POLAND,PORTUGAL,ROMANIA,RUSSIA,SWEDEN,SINGAPORE,THAILAND,TAIWAN,SOUTH-AFRICA
1972,,,,,,,,,,,...,,,,,,,,,,
1973,,,,,,,,,,,...,,,,,,,,,,
1974,,,,,,,,,,,...,,,,,,,,,,
1975,,,,,,,,,,,...,,,,,,,,,,
1976,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015,52.50,84.90,37.74,105.02,65.55,91.41,39.50,41.09,40.05,72.12,...,44.73,51.30,131.15,37.83,13.57,43.86,103.30,44.48,31.55,49.26
2016,53.06,82.90,40.54,104.73,69.82,91.98,38.17,44.18,36.82,69.27,...,42.12,54.32,131.50,37.33,12.87,42.28,111.43,40.80,31.20,51.54
2017,56.44,78.28,41.07,101.61,73.76,90.27,39.20,46.78,34.71,65.31,...,42.12,50.59,126.18,35.09,13.57,40.70,110.59,41.32,30.77,53.10
2018,85.69,73.97,41.42,99.63,76.53,89.71,37.13,50.50,32.60,61.96,...,41.87,48.82,121.93,34.72,12.16,38.78,112.28,41.85,30.85,56.66


In [6]:
# Stack the summary rows (info_df transposed) on top of the yearly estimates,
# newest year first, and write everything to ratio.csv.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat([info_df.T, country_ratio.sort_index(ascending=False)]).to_csv('ratio.csv')