### Build a dict for language proportion using sa2 code as keys

In [None]:
from couchback_temp import CouchInterface
import pandas as pd

## Get language proportion per sa2 dataframe: e.g.
## [{'206011105': {'Proportion': 0.29843663941024445}}, {'206011106': {'Proportion': 0.2914498871110239}}]

import os

## Connection settings are taken from the environment when present, so real
## credentials do not have to be committed with the notebook. The fallbacks are
## the values this cell previously hard-coded, so it runs unchanged when no
## COUCHDB_* variable is set.
ci = CouchInterface(
    address=os.environ.get("COUCHDB_ADDRESS", "172.26.134.62"),
    port=os.environ.get("COUCHDB_PORT", "5984"),
    username=os.environ.get("COUCHDB_USERNAME", "admin"),
    password=os.environ.get("COUCHDB_PASSWORD", "password"),
)

## Query the "default" view of design doc "filter" in the language database
sa2_languages = ci.non_grouped_results(db_name="aurin_lsahbsc_sa2", design_doc="filter", view_name="default")

## Build the dict: sa2 code -> language proportion
lang_prop_dict = {}
for output in sa2_languages:
    # only the first key-value pair of each result is used: (sa2 code, value dict)
    sa2, value = list(output.items())[0]

    # Skip results that carry no proportion. Previously such a result reused
    # `prop` left over from the previous iteration (or raised NameError when it
    # was the very first result), assigning another suburb's value to this one.
    if 'Proportion' not in value:
        continue

    # add to dict
    lang_prop_dict[sa2] = value["Proportion"]

lang_prop_dict

### Merge language proportions and suburb polygons based on sa2 code
Returns a (geo)dataframe and a (geo)json string. Each dataframe row / each geojson object is a suburb (with a specific sa2 code) and includes the language proportion and the suburb polygon.

In [None]:
from couchback_temp import CouchInterface
import pandas as pd

In [None]:
## Get language proportion per sa2 dataframe

import os

## Connection settings are taken from the environment when present, so real
## credentials do not have to be committed with the notebook. The fallbacks are
## the values this cell previously hard-coded, so it runs unchanged when no
## COUCHDB_* variable is set.
ci = CouchInterface(
    address=os.environ.get("COUCHDB_ADDRESS", "172.26.134.62"),
    port=os.environ.get("COUCHDB_PORT", "5984"),
    username=os.environ.get("COUCHDB_USERNAME", "admin"),
    password=os.environ.get("COUCHDB_PASSWORD", "password"),
)

## Query the "default" view of design doc "filter" in the language database
sa2_languages = ci.non_grouped_results(db_name="aurin_lsahbsc_sa2", design_doc="filter", view_name="default")

## get a list of re-structured dict / json object (each object is a suburb)
## Each raw result's first pair (sa2 code, value dict) becomes {"sa2": ..., "prop": ...}.
restructured = []
for output in sa2_languages:
    # only the first key-value pair of each result is used
    sa2, value = list(output.items())[0]

    # Skip results that carry no proportion. Previously such a result reused
    # `prop` left over from the previous iteration (or raised NameError when it
    # was the very first result), assigning another suburb's value to this one.
    if 'Proportion' not in value:
        continue

    # build a fresh record instead of adding/deleting keys on the dict in place
    restructured.append({"sa2": sa2, "prop": value["Proportion"]})

# keep the name the following cells read from
sa2_languages = restructured

sa2_languages[:10]

In [None]:
## Build a dataframe from the re-structured language records
languages_df = pd.DataFrame(data=sa2_languages)
languages_df.head(n=5)

In [None]:
## Get polygon of each sa2 dataframe

from shapely.geometry import Polygon

raw_polygons = ci.non_grouped_results(db_name="abs_austgeo_sa2", design_doc="filter", view_name="default")

## get a list of re-structured dict / json object (each object is a suburb)
sa2_polygons = []
for output in raw_polygons:
    # only the first key-value pair of each result is used
    sa2, value = list(output.items())[0]

    # Every record gets the same three keys, so the dataframe always has
    # "sa2", "name" and "geometry" columns even when a record cannot be parsed.
    record = {"sa2": str(sa2), "name": None, "geometry": None}
    try:
        record["name"] = value['SA2_NAME16']

        ## round each number to 5 decimals; value['geometry'][0] is treated as
        ## a list of coordinate lists
        rounded_polygon = [[round(x, 5) for x in coords] for coords in value['geometry'][0]]

        ## convert to a polygon object
        record["geometry"] = Polygon(rounded_polygon)
    except TypeError:
        # Best effort, as before: some results cannot be parsed (the original
        # note suspected empty geometry). The suburb is kept and whatever
        # could not be read stays None.
        pass

    sa2_polygons.append(record)

## convert to dataframe
polygons_df = pd.DataFrame(sa2_polygons)
polygons_df.head()

In [None]:
## Join language proportions with suburb polygons on the shared sa2 code

merged_df = pd.merge(languages_df, polygons_df, on="sa2")
merged_df.head()

In [None]:
## Promote the merged dataframe to a geopandas dataframe, using its
## "geometry" column as the geometry

import geopandas as gpd

polygon_gdf = gpd.GeoDataFrame(
    data=merged_df,
    geometry=merged_df["geometry"],
)
polygon_gdf.head()

In [None]:
## Write the geopandas dataframe to disk with the GeoJSON driver
polygon_gdf.to_file(filename="polygon_vs_proportion.geojson", driver="GeoJSON")

In [None]:
## 1. Read a geojson file as a dict
import json

# GeoJSON is UTF-8 by specification (RFC 7946). Passing the encoding explicitly
# keeps non-ASCII characters intact regardless of the platform's default
# text encoding.
with open("polygon_vs_proportion.geojson", encoding="utf-8") as f:
    geo_dict = json.load(f)

geo_dict

In [None]:
## 2. Load the geojson file straight into a geopandas dataframe
import geopandas as gpd

geo_df = gpd.read_file(filename='polygon_vs_proportion.geojson')
geo_df.head()

In [None]:
## Convert a geopandas dataframe to (geo)json (string representation of dict)
import json

# Serialise once and reuse the string. The earlier bare `geo_df.to_json()`
# mid-cell was never displayed, so the dataframe was serialised twice for
# one result.
geo_json_str = geo_df.to_json()

## Load the string representation of json/dict back into dict
json.loads(geo_json_str)


### Individual Tweets with coordinates and compound sentiment score
Returns a (geo)dataframe and a (geo)json string. Each dataframe row / each geojson object is an individual tweet and includes the compound sentiment score and the point coordinates.

In [None]:
from couchback_temp import CouchInterface
import pandas as pd

import os

## Connection settings are taken from the environment when present, so real
## credentials do not have to be committed with the notebook. The fallbacks are
## the values this cell previously hard-coded, so it runs unchanged when no
## COUCHDB_* variable is set.
ci = CouchInterface(
    address=os.environ.get("COUCHDB_ADDRESS", "172.26.134.62"),
    port=os.environ.get("COUCHDB_PORT", "5984"),
    username=os.environ.get("COUCHDB_USERNAME", "admin"),
    password=os.environ.get("COUCHDB_PASSWORD", "password"),
)

## Since we want individual tweets, the view must have no reduce function, and no group.
## If there is a reduce function and also no group, there will be a full aggregation and no key available.
valid_tweets = ci.non_grouped_results(db_name="twitter_historic", 
                                       design_doc="tweets", view_name="election_tweets")

In [None]:
from datetime import datetime
from shapely.geometry import Point

## Flatten every raw tweet into one dict / json object
tweets = []
for tweet in valid_tweets:
    # only the first value of each result is used
    value = [*tweet.values()][0]

    # re-format the tweet timestamp via a python datetime
    # e.g. 'Sun Aug 03 08:25:21 +0000 2014' to 2014-08-03 08:25:21
    parsed_time = datetime.strptime(value['time'], '%a %b %d %H:%M:%S +0000 %Y')
    new_dtime = parsed_time.strftime('%Y-%m-%d %H:%M:%S')

    # wrap the raw coordinates in a shapely point object
    point = Point(value['coord'])

    tweets.append({
        "compound": value['compound'],
        "created_at": new_dtime,
        "text": value['text'],
        "geometry": point,
    })

## convert to dataframe
tweets_df = pd.DataFrame(tweets)
tweets_df.head()

In [None]:
## Promote the tweets dataframe to a geopandas dataframe, using its
## "geometry" column as the geometry

import geopandas as gpd

tweets_gdf = gpd.GeoDataFrame(
    data=tweets_df,
    geometry=tweets_df["geometry"],
)
tweets_gdf.head()

In [None]:
# `json` was previously only imported by cells of an earlier section, so
# running this section on its own raised NameError; import it where it is used.
import json

# convert to (geo)json string and then load back to dict
json.loads(tweets_gdf.to_json())

In [16]:
import numpy as np
# LinearRegression was only imported by a later cell, so this cell raised
# NameError on a fresh kernel (Restart & Run All); import it where it is used.
from sklearn.linear_model import LinearRegression

# six samples of a single feature; reshape((-1, 1)) turns the 1-D array into
# a column of shape (6, 1), i.e. one row per sample
x = np.array([5, 15, 25, 35, 45, 55]).reshape((-1, 1))
y = np.array([5, 20, 14, 32, 22, 38])

# fit ordinary least squares and show the slope(s) and the intercept
reg = LinearRegression().fit(x, y)
print(reg.coef_, reg.intercept_)

[0.54] 5.633333333333329


In [17]:
import numpy as np
from sklearn.linear_model import LinearRegression

# six sample values and a float vector of ones with the same shape
a = np.array([5, 15, 25, 35, 45, 55])
i = np.ones_like(a, dtype=float)

# stack the two vectors as rows, then transpose: one row per sample, shape (6, 2)
X = np.vstack((i, a)).T

y = np.array([6,10, 22,30, 38, 51])

# product of X with its own transpose (a 6 x 6 matrix)
X @ X.T

array([[  26.,   76.,  126.,  176.,  226.,  276.],
       [  76.,  226.,  376.,  526.,  676.,  826.],
       [ 126.,  376.,  626.,  876., 1126., 1376.],
       [ 176.,  526.,  876., 1226., 1576., 1926.],
       [ 226.,  676., 1126., 1576., 2026., 2476.],
       [ 276.,  826., 1376., 1926., 2476., 3026.]])