# Douban_world_movies_analysis

## Import data

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import re
from datetime import datetime
from zhtools import langconv

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
# Select Altair's 'notebook' renderer and draw matplotlib figures inline.
alt.renderers.enable('notebook')
%matplotlib inline

plt.rcParams['font.sans-serif'] = ['SimHei']  # Chinese font setting: SimHei
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign '-' from showing as a box in saved figures
sns.set(font='SimHei')  # make Seaborn display Chinese text

In [None]:
# Load the raw CSV into `data`; an earlier variant of the load call is kept
# commented out below.
#data = pd.read_csv("douban_data/douban-world-moive-raw-data.csv",error_bad_lines=False)
data = pd.read_csv("../douban-world-moive-raw-data.csv")

In [None]:
# Number of non-null values in each column.
data.count()

## Clean data

剔除以下关键信息缺失的影片:
* actor 演员
* date 上映时间
* director 导演
* language 语言
* type 类型
* region 地区

In [None]:
# Flag every row in which at least one required column is missing (NaN) or
# holds the literal empty-list string '[]'.
required_cols = ['actor', 'date', 'director', 'language', 'type', 'region']
required = data[required_cols]
cond = (required.isnull() | (required == '[]')).any(axis=1)

In [None]:
# Keep only the rows not flagged by `cond`. The explicit .copy() makes
# `data1` an independent frame, so the column assignments performed on it
# later do not act on a slice of `data` (pandas SettingWithCopy ambiguity).
data1 = data.loc[~cond].copy()
data1.shape

In [None]:
# 将str转为list
list_col = ['actor','date','director','language','region','type']
for col in list_col:
    data1[col] = data1[col].apply(lambda x:[i.strip().strip("'") for i in x[1:-1].split(",")])

In [None]:
data1.iloc[0].T

### Year

In [None]:
data_selected = data1[data1['year'].str.contains(r'\b\d{4}\b',na=False,regex=True)]

In [None]:
data_selected = data_selected[data_selected['year'].str.contains(r'^(191[3-9])|^(19[2-9]\d)|^(202[0])|^(20[0-1]\d)$',na=False,regex=True)]

In [None]:
# Number of films per year, as a two-column frame sorted by year.
year_count = (
    data_selected['year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='count')
    .sort_values('year')
)
year_count.head()

### Region

In [None]:
# Turn ['法国 / 波兰 / 瑞士'] into ['法国', '波兰', '瑞士'].
# Every list element is split on "/" on its own: joining the elements with
# "" first would glue separate entries together (['美国', '英国'] would
# become the single name '美国英国'). Blank pieces left by stray slashes
# are dropped.
def _split_regions(items):
    """Return the individual, stripped region names contained in *items*."""
    pieces = (piece.strip() for item in items for piece in item.split("/"))
    return [piece for piece in pieces if piece]


data_selected['region'] = data_selected['region'].map(_split_regions)

In [None]:
# Every distinct region name that occurs in the `region` lists.
region_set = {name for names in data_selected['region'] for name in names}

In [None]:
# As seen above, the same country can appear under several spellings.
# Mixed Chinese/English names are reduced to their Chinese part, and
# Traditional Chinese is converted to Simplified Chinese.
def ZhEn2Zh(x):
    """Normalise a name to Simplified Chinese.

    If *x* contains any character in the range 一-龥, every character
    outside that range is removed. The result is then passed through the
    'zh-hans' converter; strings without such characters only go through
    the converter.
    """
    # search() instead of match(): match() only inspects the start of the
    # string, so mixed names whose English part comes first (for example
    # "USA 美国") were left untouched.
    if re.search("[一-龥]", x):
        x = re.sub("[^一-龥]+","", x)
    # Traditional -> Simplified
    x = langconv.Converter('zh-hans').convert(x)
    return x

In [None]:
# Run every region name of every film through ZhEn2Zh.
data_selected['region'] = data_selected['region'].map(lambda names: list(map(ZhEn2Zh, names)))

In [None]:
# For names written in English, map a selection of major regions (and a few
# stray variants) to one canonical Chinese name.
# Note: "Austria" maps to 奥地利 (Austria), not 澳大利亚 (Australia).
region_dict = {"America":"美国","American":"美国","US":"美国","U.S.A":"美国","USA":"美国","usa":"美国",
               "United States USA":"美国","Argentina":"阿根廷","Argentina)":"阿根廷","Mexico":"墨西哥",
               "Canada":"加拿大","CANADA":"加拿大","(Canada)":"加拿大","Brazil":"巴西","Brasil":"巴西",
               "BBC":"英国","UK":"英国","uk":"英国",
               "Australia":"澳大利亚","Austria":"奥地利",
               "china":"中国","China":"中国","中国杭州":"中国",
               "India":"印度","india":"印度","Japan":"日本","South Korea":"韩国",
               "(Spain)":"西班牙","Spain":"西班牙","Germany":"德国","Germany Germany":"德国",
               "Sweden":"瑞典","sweden":"瑞典","France":"法国","Franch":"法国","Italy":"意大利",
               "Russia":"俄罗斯","Russian":"俄罗斯","Russion":"俄罗斯",
               "俄国":"俄罗斯","俄语":"俄罗斯","Soviet Union":"苏联",
               "印尼":"印度尼西亚","(Indonesia)":"印度尼西亚","Indonesia":"印度尼西亚","indonesia":"印度尼西亚"}

def region_update(x):
    """Return the canonical name for region *x*.

    Names listed in ``region_dict`` are replaced by their mapped value;
    any other name is returned unchanged.
    """
    return region_dict.get(x, x)

In [None]:
# Replace every region name by its canonical form from region_update.
data_selected['region'] = data_selected['region'].map(lambda names: list(map(region_update, names)))

In [None]:
# region_set_after = set()
# for each in data_selected['region']:
#     for i in each:
#         region_set_after.add(i)
# region_set_after

In [None]:
region_year_list = []
for row in zip(data_selected['region'].tolist(),data_selected['year'].tolist()):
    region = row[0]
    year = row[1]
    for i in region:
        region_year_list.append([i,year])

In [None]:
df_region_year = pd.DataFrame(region_year_list)
df_region_year.columns = ['region','year']
df_region_year.head(1)

In [None]:
# Number of films per region, largest count first.
region_count = (
    df_region_year['region']
    .value_counts()
    .rename_axis('region')
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
region_count.head()

### Region trend

In [None]:
# Number of films for every (region, year) combination, as a flat table
# with the columns region, year and count.
region_count_year = (
    df_region_year
    .groupby(['region', 'year'])
    .size()
    .reset_index(name='count')
)

### Type

In [None]:
type_set = set()
for each in data_selected['type']:
    for i in each:
        type_set.add(i)
#type_set

In [None]:
# 可以从上面看到中文和英文的混合，以及繁体
# 对于中英混合的表达转换为纯中文，且将繁体转换为中文
data_selected['type'] = data_selected['type'].map(lambda x: [ZhEn2Zh(i) for i in x])

In [None]:
# type_set_after = set()
# for each in data_selected['type']:
#     for i in each:
#         type_set_after.add(i)
# type_set_after

In [None]:
type_count_dict = {}
for row in data_selected['type'].tolist():
    for i in row:
        type_count_dict[i] = type_count_dict.get(i,0) + 1

In [None]:
type_count = pd.DataFrame.from_dict(type_count_dict, orient='index').reset_index()
type_count.columns = ['type','count']
type_count = type_count.sort_values('count',ascending=False)
type_count.head()

### Month

In [None]:
def extract_month(x):
    """Return the month of the first date in *x* as an unpadded string.

    *x* is a list of date strings such as '1993-09-08(法国)' or
    '1979-07(中国大陆)'. The month is read from the leading ``YYYY-M[M]``
    part of the first entry and returned as '9' rather than '09'.

    Returns None when *x* is empty, when the first entry does not start
    with a year and month, or when the month is outside 1-12.
    """
    if not x:
        return None
    m = re.match(r"\d{4}-(\d{1,2})", x[0])
    if m is None:
        return None
    # The month is read directly from the match instead of going through
    # datetime.strptime, which raised ValueError for entries with an
    # impossible day (e.g. '2010-02-30') even though the month is usable.
    month = int(m.group(1))
    if 1 <= month <= 12:
        return str(month)
    return None

In [None]:
# Month of each film, taken from the first entry of its date list.
data_selected['month'] = data_selected['date'].map(extract_month)

In [None]:
# Number of films per month as a two-column table (month, count).
month_count = (
    data_selected['month']
    .value_counts()
    .rename_axis('month')
    .reset_index(name='count')
)
month_count.head()

### month_region

In [None]:
region_month_list = []
for row in zip(data_selected['region'].tolist(),data_selected['month'].tolist()):
    region = row[0]
    month = row[1]
    for i in region:
        region_month_list.append([i,month])

In [None]:
df_region_month = pd.DataFrame(region_month_list)
df_region_month.columns = ['region','month']
df_region_month.head(1)

In [None]:
region_count_month = df_region_month.groupby(['region','month']).agg({"region":"count"})
region_count_month.columns = ['count']
region_count_month = region_count_month.reset_index()

In [None]:
region_selected = region_count[:10]['region'].tolist()

In [None]:
region_count_month = region_count_month.loc[region_count_month['region'].isin(region_selected)].sort_values(['region','month'])
region_count_month.head()

### rate

In [None]:
# Keep only films that have a rating, with the rating stored as float.
# dropna() replaces the earlier np.isnan(...) mask, which raises TypeError
# when the column is not already a numeric dtype; the column is assigned by
# key rather than by attribute.
data_selected_rate = data_selected.dropna(subset=['rate']).copy()
data_selected_rate['rate'] = data_selected_rate['rate'].astype(float)

### rate_region

In [None]:
region_rate_list = []
for row in zip(data_selected_rate['region'].tolist(),data_selected_rate['rate'].tolist()):
    region = row[0]
    rate = row[1]
    for i in region:
        region_rate_list.append([i,rate])

In [None]:
df_region_rate = pd.DataFrame(region_rate_list)
df_region_rate.columns = ['region','rate']
df_region_rate.head(1)

In [None]:
region_selected = region_count[:20]['region'].tolist()

In [None]:
mean_rate_region = df_region_rate.loc[df_region_rate['region'].isin(region_selected)].groupby('region').agg({"rate":"mean"}).sort_values("rate",ascending=False)

In [None]:
mean_rate_region = pd.merge(mean_rate_region,region_count,on=['region'],how='left').sort_values('count',ascending=False)

In [None]:
mean_rate_region.head()

### time

In [None]:
# Keep films whose runtime text contains a number, then extract that number.
# - contains() uses a group-free pattern, which avoids pandas' warning about
#   match groups.
# - extract() takes the whole digit run: with (\d{1,3}) a value such as
#   "1440" was cut to "144" and slipped past the later <=300 filter.
# - .copy() yields an independent frame for the column assignment that
#   follows.
data_selected_time = data_selected[data_selected.runtime.str.contains(r'\d',na=False,regex=True)].copy()
time = data_selected_time.runtime.str.extract(r'(\d+)')

In [None]:
# Replace the runtime text with the extracted number held in `time`,
# converted to int.
data_selected_time['runtime'] = time.astype(int)
# Drop films whose runtime value exceeds 300.
data_selected_time = data_selected_time[data_selected_time.runtime<=300]

In [None]:
data_selected_time.head()

### time_region

In [None]:
region_runtime_list = []
for row in zip(data_selected_time['region'].tolist(),data_selected_time['runtime'].tolist()):
    region = row[0]
    runtime = row[1]
    for i in region:
        region_runtime_list.append([i,runtime])

In [None]:
df_region_runtime = pd.DataFrame(region_runtime_list)
df_region_runtime.columns = ['region','runtime']
df_region_runtime.head(1)

In [None]:
mean_time_region = df_region_runtime.loc[df_region_runtime['region'].isin(region_selected)].groupby('region').agg({"runtime":"mean"}).sort_values("runtime",ascending=False)

In [None]:
mean_time_region = pd.merge(mean_time_region,region_count,on=['region'],how='left').sort_values('count',ascending=False)

In [None]:
mean_time_region.head()

### rate_time_month_region

In [None]:
data_selected_rate_time_month = data_selected.drop(data_selected[np.isnan(data_selected['rate'])].index)
data_selected_rate_time_month.rate = data_selected_rate_time_month.rate.astype(float)
data_selected_rate_time_month = data_selected_rate_time_month[data_selected_rate_time_month.runtime.str.contains(r'(\d{1,3})',na=False,regex=True)]
time = data_selected_rate_time_month.runtime.str.extract(r'(\d{1,3})')
data_selected_rate_time_month['runtime'] = time.astype(int)
data_selected_rate_time_month = data_selected_rate_time_month[data_selected_rate_time_month.runtime<=300]

In [None]:
data_selected_rate_time_month.head()

In [None]:
# Long-format frame: for each of the ten regions with the most films, the
# films that list that region, with `region` set to that single name.
#
# Fixes over the earlier loop:
# - region_count already has a 'region' column, so the names are read from
#   it; an extra reset_index() produced an 'index' column of integers.
# - `region` holds lists, so membership is tested per list; .str.contains
#   on a list-valued column yields NaN and therefore selected nothing.
# - frames are collected and combined once with pd.concat, since
#   DataFrame.append no longer exists in pandas 2.x.
top_regions = region_count.sort_values(by='count', ascending=False)['region'][:10]
region_frames = []
for i in top_regions:
    in_region = data_selected_rate_time_month['region'].map(lambda regions: i in regions)
    data_selected_temp = data_selected_rate_time_month[in_region].copy()
    data_selected_temp['region'] = i
    region_frames.append(data_selected_temp)
# pd.concat raises on an empty list, so fall back to an empty frame.
data_selected_rate_time_month_region = pd.concat(region_frames) if region_frames else pd.DataFrame()

## Plot

### Overall

In [None]:
# Years shown in the overview chart: every fifth year from 1913 to 1993,
# then every year from 1998 to 2020. `year` is stored as text, so the
# selection is converted to strings before filtering.
# The integer list is bound to `year_range`, the name the conversion reads;
# it was previously never defined, which raised NameError.
year_range = list(range(1913, 1998, 5)) + list(range(1998, 2021))
year_selected = [str(x) for x in year_range]
data_selected_year = data_selected.loc[data_selected['year'].isin(year_selected)]

In [None]:
# Count plot with `year` on the y axis, fed with the selected years sorted
# in descending order.
figsize = 20,30
figure, ax = plt.subplots(figsize=figsize)
sns.countplot(y='year', data=data_selected_year.sort_values(by='year',ascending=False), orient="v")
plt.show()

### region

In [None]:
# Bar plot of the film count for the first 20 rows of region_count.
figsize = 20,30
figure, ax = plt.subplots(figsize=figsize)
# sns.countplot(y='index',data=region_count.reset_index().sort_values(by = 'count',ascending=False)[:20], orient="v")
ax = sns.barplot(x="count", y="region", data=region_count[:20])
plt.show()

### region_trend

In [None]:
# First ten regions of region_count.
main_region = region_count[:10]['region'].tolist()
main_region

In [None]:
# Per-year counts restricted to those regions.
region_count_year_selected = region_count_year.loc[region_count_year['region'].isin(main_region)]

In [None]:
# Line chart of count over year, one coloured line per region, legend on
# the right.
alt.Chart(region_count_year_selected).mark_line().encode(
    x='year',
    y='count',
    color=alt.Color('region', legend=alt.Legend(orient="right")),
).configure_view(
    height=600,
    width=400,
)

In [None]:
# Per-year counts for three fixed regions, limited to the years in
# year_selected.
top3 = ['美国','日本','中国大陆']
df = region_count_year_selected.loc[(region_count_year_selected['region'].isin(top3)) &
                                     (region_count_year_selected['year'].isin(year_selected))]

In [None]:
# Horizontal grouped bars: one group per region, one bar per year.
figsize = 20,30
figure, ax = plt.subplots(figsize=figsize)
sns.barplot(y='region', x="count", hue='year', data=df, orient="h")
plt.show()

### type

In [None]:
# Bar plot of the 25 largest genre counts.
figsize = 20,30
figure, ax = plt.subplots(figsize=figsize)
ax = sns.barplot(x="count", y="type", data=type_count.reset_index().sort_values(by = 'count',ascending=False)[:25])
plt.show()

### month

In [None]:
# Convert the month labels to int so they sort numerically, then show the
# table ordered by count.
month_count['month'] = month_count['month'].apply(lambda x:int(x))
month_count = month_count.sort_values('month')
month_count.sort_values('count',ascending=False)

In [None]:
# Vertical bar plot of the film count per month.
figsize = 20,10
figure, ax = plt.subplots(figsize=figsize)
sns.barplot(x='month',y='count',data=month_count, orient="v")
plt.show()

### month region

In [None]:
# Convert the month labels to int and order the table by region, then month.
region_count_month['month'] = region_count_month['month'].apply(lambda x:int(x))
region_count_month.sort_values(['region','month'],inplace=True)

In [None]:
# Horizontal grouped bars: one group per region, one bar per month.
figsize = 20,30
figure, ax = plt.subplots(figsize=figsize)
sns.barplot(y='region', x="count", hue='month', data=region_count_month, orient="h")
plt.show()

In [None]:
# Remove Altair's row limit, then draw normalized stacked bars: one bar per
# region, split by month, segments ordered by ascending month.
alt.data_transformers.enable('default', max_rows=None)
alt.Chart(region_count_month).mark_bar().encode(
    x=alt.X('sum(count)', stack="normalize"),
    y='region',
    color='month:N',
    order=alt.Order(
      'month',
      sort='ascending'
    )
)

### rate

In [None]:
# Count plot of the number of films at each rating value.
figsize = 30,20
figure, ax = plt.subplots(figsize=figsize)
sns.countplot(x='rate', data=data_selected_rate.sort_values(by='rate',ascending=False), orient="v")
# sns.distplot(data_selected_rate.rate)
plt.show()

In [None]:
# Mean rating over all rated films.
data_selected_rate.rate.mean()

### rate_region

In [None]:
mean_rate_region.head()

In [None]:
# Bar plot of the mean rating per region.
figsize = 20,10
figure, ax = plt.subplots(figsize=figsize)
ax = sns.barplot(x='region',y= 'rate',data=mean_rate_region)
plt.show()

### time

In [None]:
# Distribution plot of the runtime values.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm
# it is still available in the installed version.
figsize = 20,10
figure, ax = plt.subplots(figsize=figsize)
sns.distplot(data_selected_time.runtime)
plt.show()

### time_region

In [None]:
mean_time_region

In [None]:
# Bar plot of the mean runtime per region.
figsize = 20,10
figure, ax = plt.subplots(figsize=figsize)
ax = sns.barplot(x='region',y= 'runtime',data=mean_time_region)
plt.show()

### rate_time_month_region

In [None]:
# Pair grid over month, rate and runtime, coloured by region: histograms on
# the diagonal, scatter plots elsewhere, plus a legend.
# NOTE(review): `month` is produced as a string by extract_month — confirm
# it plots as intended here.
# iris = sns.load_dataset("iris")
g = sns.PairGrid(data_selected_rate_time_month_region , hue='region', vars=["month", "rate","runtime"])
# g = g.map(plt.scatter)
# g = g.map_upper(plt.scatter)
# g = g.map_lower(sns.kdeplot, cmap="Blues_d")
g = g.map_diag(plt.hist)
g = g.map_offdiag(plt.scatter)
g = g.add_legend()
plt.show()