# Trying Hidden Markov Models

In [1]:
# Show which scikit-learn version this notebook was run with.
import sklearn
sklearn.__version__

'0.19.1'

In [2]:
import numpy as np
from hmmlearn import hmm

In [3]:
# Initial state distribution of the 3-state HMM: one probability per hidden state.
startprob = np.array([
    0.6,  # state 0
    0.3,  # state 1
    0.1,  # state 2
])
startprob

array([0.6, 0.3, 0.1])

In [4]:
# Transition matrix of the 3-state HMM; every row sums to 1.
transmat = np.array([
    [0.7, 0.2, 0.1],
    [0.3, 0.5, 0.2],
    [0.3, 0.3, 0.4],
])
transmat

array([[0.7, 0.2, 0.1],
       [0.3, 0.5, 0.2],
       [0.3, 0.3, 0.4]])

In [5]:
# Emission means: one 2-D mean vector per hidden state.
means = np.array([
    [0.0, 0.0],    # state 0
    [3.0, -3.0],   # state 1
    [5.0, 10.0],   # state 2
])
means

array([[ 0.,  0.],
       [ 3., -3.],
       [ 5., 10.]])

In [6]:
# Emission covariances: the same 2x2 identity matrix for each of the 3 states,
# stacked into an array of shape (3, 2, 2).
covars = np.array([np.identity(2) for _ in range(3)])
covars

array([[[1., 0.],
        [0., 1.]],

       [[1., 0.],
        [0., 1.]],

       [[1., 0.],
        [0., 1.]]])

In [7]:
# Build a 3-state Gaussian HMM directly from the hand-written parameters
# (no fitting involved). A fixed random_state makes the sample below
# reproducible across kernel restarts.
model = hmm.GaussianHMM(n_components=3, covariance_type="full", random_state=42)
model.startprob_ = startprob
model.transmat_ = transmat
model.means_ = means
model.covars_ = covars
# Draw 100 observations (X) together with the hidden state behind each one (Z).
X, Z = model.sample(100)

In [8]:
# Sampled observations: one row per time step, one column per feature.
X

array([[ 8.47982837e-01, -6.03137735e-01],
       [ 3.75544866e+00, -2.63403159e+00],
       [ 2.03964686e+00, -1.58816463e+00],
       [ 3.90972710e+00, -2.68277761e+00],
       [ 3.98445451e+00,  1.04216804e+01],
       [ 6.17910867e+00,  8.56266823e+00],
       [ 6.54424581e+00,  1.12030935e+01],
       [-3.33701827e-01,  1.49177739e+00],
       [ 4.61924499e+00,  1.06688501e+01],
       [-1.80490785e+00, -9.00955401e-01],
       [-6.54285072e-01,  4.13218656e-02],
       [ 4.54357866e+00, -1.44062771e+00],
       [ 2.32553079e+00, -3.49388364e+00],
       [ 2.32200665e+00, -2.65079885e+00],
       [ 4.22209070e+00, -3.40217120e+00],
       [-4.10185368e-03,  8.04470900e-01],
       [-1.75242540e+00, -1.46338628e-01],
       [ 4.57597383e+00,  1.13827055e+01],
       [ 6.25956191e+00,  1.00522609e+01],
       [ 7.74394867e-01,  7.28045067e-01],
       [ 1.05017427e+00, -3.15573596e-02],
       [-2.39055312e-01,  1.72889175e-01],
       [ 1.26982518e+00, -3.43627806e+00],
       [ 4.

In [9]:
# Hidden state index (0, 1 or 2) that generated each sampled observation.
Z

array([0, 1, 1, 1, 2, 2, 2, 0, 2, 0, 0, 1, 1, 1, 1, 0, 0, 2, 2, 0, 0, 0,
       1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 1, 2, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2,
       2, 0, 0, 2, 1, 1, 1, 1, 1, 0, 0, 0])

# Happy Grumpy Sunny Rainy Example

https://www.youtube.com/watch?v=kqSzLo9fenk&t=431s

In [10]:
# Two-state weather model (S = sunny, R = rainy). n_components must match the
# length of startprob_ and the shape of transmat_, hence 2 states.
lr = hmm.GaussianHMM(n_components=2, covariance_type="diag", init_params="cm", params="cmt")
lr.startprob_ = np.array([0.67, 0.33])
# hmmlearn expects a row-stochastic matrix: the row is the current state, the
# column is the next state, and every row sums to 1.
#                          S    R
lr.transmat_ = np.array([[0.8, 0.2],  # S
                         [0.4, 0.6]]) # R



In [11]:
# Observed weather sequence as a single row of one-letter labels (S / R).
X = np.array([list("SSSSRRRSSSSRRSS")])

In [12]:
# Untrained 2-state Gaussian HMM with full covariance matrices and n_iter=100.
remodel = hmm.GaussianHMM(
    n_components=2,
    covariance_type="full",
    n_iter=100,
)
# Fitting and decoding are left disabled for now:
# remodel.fit(X)
# Z2 = remodel.predict(X)
remodel

GaussianHMM(algorithm='viterbi', covariance_type='full', covars_prior=0.01,
      covars_weight=1, init_params='stmc', means_prior=0, means_weight=0,
      min_covar=0.001, n_components=2, n_iter=100, params='stmc',
      random_state=None, startprob_prior=1.0, tol=0.01, transmat_prior=1.0,
      verbose=False)

# Stock price prediction using Hidden Markov Model
https://rubikscode.net/2018/10/29/stock-price-prediction-using-hidden-markov-model/

In [13]:
import pandas as pd
import logging
from sklearn.model_selection import train_test_split
from hmmlearn.hmm import GaussianHMM

# Root folder that contains the `company_data/<company>.csv` price files.
# NOTE(review): absolute path on the author's machine; point it at your own
# data directory before running.
DATA_DIR = "/home/tales/dev/stock_data/data/"

class StockPredictor(object):
    """Fit a Gaussian HMM on daily price-change features of one company.

    The price history is read from ``<DATA_DIR>/company_data/<company>.csv``;
    the file must provide ``open``, ``close``, ``high`` and ``low`` columns.
    """

    def __init__(self, company, n_latency_days=10, n_hidden_states=4):
        """
        Load the company's price history and create the (unfitted) HMM.

        Parameters
        ----------
        company: str
            Name of the CSV file (without extension) to load.
        n_latency_days: int
            Stored on the instance; not used by any method of this class.
        n_hidden_states: int
            Number of hidden states of the underlying GaussianHMM.
        """
        self._init_logger()

        self.company = company
        self.n_latency_days = n_latency_days

        self.hmm = GaussianHMM(n_components=n_hidden_states)

        self.data = pd.read_csv(self._csv_path())

    def _csv_path(self):
        """Return the path of this company's CSV file below DATA_DIR."""
        # Function-scope import: the cell defining this class does not import os.
        import os

        # os.path.join avoids the doubled slash that plain concatenation
        # produced when DATA_DIR already ends with '/'.
        return os.path.join(
            DATA_DIR, 'company_data', '{company}.csv'.format(company=self.company))

    def _init_logger(self):
        """Bind the module logger and make sure it has one stream handler."""
        self._logger = logging.getLogger(__name__)
        # The logger is shared by all instances and survives cell re-runs, so
        # install the handler only once; otherwise every new StockPredictor
        # would add another handler and each message would be printed repeatedly.
        if not self._logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter(
                '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
            handler.setFormatter(formatter)
            self._logger.addHandler(handler)
        self._logger.setLevel(logging.DEBUG)

    @staticmethod
    def _extract_features(data):
        """
        Turn OHLC prices into the fractional-change features fed to the HMM.

        Parameters
        ----------
        data: mapping of column name -> sequence
            Must provide 'open', 'close', 'high' and 'low'.

        Returns
        -------
        numpy.ndarray of shape (n_rows, 3)
            Columns are (close - open) / open, (high - open) / open and
            (open - low) / open.
        """
        open_price = np.array(data['open'])
        close_price = np.array(data['close'])
        high_price = np.array(data['high'])
        low_price = np.array(data['low'])

        # Fractional changes of the close, high and low prices relative to the
        # opening price; these are used as the features.
        frac_change = (close_price - open_price) / open_price
        frac_high = (high_price - open_price) / open_price
        frac_low = (open_price - low_price) / open_price

        return np.column_stack((frac_change, frac_high, frac_low))

    def _split_train_test_data(self, test_size):
        """
        Split the loaded price history into train and test parts, in order.

        The rows are not shuffled, so the test part is the tail of the data.
        The result is stored in ``self._train_data`` and ``self._test_data``.
        """
        # Reuse the frame loaded in __init__ instead of reading the CSV again.
        _train_data, test_data = train_test_split(
            self.data, test_size=test_size, shuffle=False)

        self._train_data = _train_data
        self._test_data = test_data

    def fit(self):
        """Extract the features from the loaded data and fit the HMM on them."""
        self._logger.info('>>> Extracting Features')
        # NOTE(review): this trains on the complete history in self.data, not on
        # the train split produced by _split_train_test_data - confirm intent.
        feature_vector = StockPredictor._extract_features(self.data)
        self._logger.info('Features extraction Completed <<<')

        self.hmm.fit(feature_vector)
 

In [14]:
# Predictor for GOOGL stocks
stock_predictor = StockPredictor(company='GOOGL')
stock_predictor.data

FileNotFoundError: File b'/home/tales/dev/stock_data/data//company_data/GOOGL.csv' does not exist

# Markov Chains with Python

https://medium.com/@__amol__/markov-chains-with-python-1109663f3678

In [15]:
import numpy as np
 
class MarkovChain(object):
    """First-order Markov chain sampled from a dict of transition probabilities."""

    def __init__(self, transition_prob):
        """
        Initialize the MarkovChain instance.

        Parameters
        ----------
        transition_prob: dict
            A dict object representing the transition
            probabilities in Markov Chain.
            Should be of the form:
                {'state1': {'state1': 0.1, 'state2': 0.4},
                 'state2': {...}}
            Transitions that are left out of an inner dict are treated as
            having probability 0.
        """
        self.transition_prob = transition_prob

        # Start with the origin states in dict order, then append states that
        # only ever show up as destinations so they can still be reached.
        states = list(transition_prob.keys())
        seen = set(states)
        for outgoing in transition_prob.values():
            for destination in outgoing:
                if destination not in seen:
                    seen.add(destination)
                    states.append(destination)
        self.states = states

    def next_state(self, current_state):
        """
        Returns the state of the random variable at the next time
        instance.

        Parameters
        ----------
        current_state: str
            The current state of the system.

        Raises
        ------
        KeyError
            If ``current_state`` has no entry in the transition dict.
        ValueError
            If ``current_state`` has no outgoing transition with positive
            probability.
        """
        outgoing = self.transition_prob[current_state]
        weights = np.array(
            [outgoing.get(state, 0.0) for state in self.states], dtype=float)

        total = weights.sum()
        if not total > 0:
            raise ValueError(
                'No outgoing transition with positive probability from state '
                '{!r}'.format(current_state))

        # Rescale so the vector sums to exactly 1; np.random.choice rejects
        # probabilities that are slightly off, e.g. after rounding.
        return np.random.choice(self.states, p=weights / total)

    def generate_states(self, current_state, no=10):
        """
        Generates the next states of the system.

        Parameters
        ----------
        current_state: str
            The state of the current random variable.

        no: int
            The number of future states to generate.

        Returns
        -------
        list
            The ``no`` generated states, in order; ``current_state`` itself
            is not included.
        """
        future_states = []
        for _ in range(no):
            current_state = self.next_state(current_state)
            future_states.append(current_state)
        return future_states

In [16]:
# Weather transition table: outer key = today's weather, inner key = tomorrow's.
transition_prob = {
    'Sunny': {'Sunny': 0.8, 'Rainy': 0.2},
    'Rainy': {'Sunny': 0.4, 'Rainy': 0.6},
}

In [17]:
# Build the weather chain from the transition table and list its states.
weather_chain = MarkovChain(transition_prob=transition_prob)
weather_chain.states

['Sunny', 'Rainy']

In [18]:
# Draw one random next state given that the current state is 'Sunny'.
weather_chain.next_state(current_state='Sunny')

'Rainy'

In [None]:
# Draw one random next state given that the current state is 'Rainy'.
weather_chain.next_state(current_state='Rainy')

In [None]:
# Simulate 15 successive weather states starting from 'Sunny'.
f = weather_chain.generate_states(current_state='Sunny', no=15)
f

In [None]:
# Observed weather sequence as a single-row array, written as runs of equal labels.
X = np.array([
    ["Sunny"] * 4 + ["Rainy"] * 3 + ["Sunny"] * 4 + ["Rainy"] * 2 + ["Sunny"] * 2
])

In [None]:
# The same observed weather sequence as a plain Python list, built from runs.
a = ["Sunny"] * 4 + ["Rainy"] * 3 + ["Sunny"] * 4 + ["Rainy"] * 2 + ["Sunny"] * 2
a

In [None]:
# Number of observations in the weather sequence.
len(a)

In [None]:
import os
# Switch to the project root, presumably so that the project-local `src`
# package imported below can be resolved.
# NOTE(review): absolute path on the author's machine; adjust before running.
os.chdir("/home/tales/dev/master/mdc_analysis/")
print("working dir", os.getcwd())

# Project-local modules (not part of this notebook).
from src.dao import csv_dao
from src.entity.stop_region import StopRegionGroup

# Silence FutureWarning messages for the rest of the session.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Identifier of the user whose stop regions are loaded.
user_id = 6070
# NOTE(review): csv_dao is project code not shown here; presumably this returns
# the user's stop regions in visiting order - confirm.
sr_list = csv_dao.stop_region_sequence(user_id)

In [None]:
# Wrap the loaded stop regions in a StopRegionGroup and show its size.
# NOTE(review): StopRegionGroup is project code not shown here; the effect of
# agglutinate_stop_regions=True should be confirmed against its definition.
stop_regions_group = StopRegionGroup(sr_list, agglutinate_stop_regions=True)
stop_regions_group.size()

In [None]:
# Sequence of stop-region tags for this user; it is the state sequence handed
# to transition_probabilities further down (after .tolist()).
sequence_tags = stop_regions_group.sequence_stop_region_tags()
print(len(sequence_tags))
sequence_tags

In [None]:
def transition_probabilities(sequence_states, round_proba=4):
    """
    Estimate first-order transition probabilities from an observed sequence.

    Every consecutive pair ``(sequence_states[i], sequence_states[i + 1])``
    counts as one transition. Each count is divided by the number of
    transitions leaving the same origin, so the probabilities of one origin
    sum to 1 (up to rounding) and can be used as a Markov-chain row.

    Parameters
    ----------
    sequence_states: sequence
        Observed states in the order they occurred. Each state is converted
        with ``str``.
    round_proba: int or None
        Number of decimals kept in the probabilities; ``None`` disables
        rounding.

    Returns
    -------
    pandas.DataFrame
        One row per distinct transition, in order of first occurrence, with
        the columns ``origin``, ``destination`` and ``transition_freq`` (the
        estimated probability of ``destination`` given ``origin``). The frame
        is empty when the sequence has fewer than two states.
    """
    columns = ["origin", "destination", "transition_freq"]

    states = [str(state) for state in sequence_states]
    if len(states) < 2:
        # No consecutive pair, hence no transition to count.
        return pd.DataFrame(columns=columns)

    pairs = pd.DataFrame(
        {"origin": states[:-1], "destination": states[1:]},
        columns=["origin", "destination"])

    # Count each distinct (origin, destination) pair.
    trans_proba_df = (
        pairs.groupby(["origin", "destination"], sort=False)
        .size()
        .reset_index(name="transition_freq")
    )

    # Normalise per origin (not by the grand total): a transition probability
    # is conditional on the state being left.
    outgoing_total = trans_proba_df.groupby("origin")["transition_freq"].transform("sum")
    trans_proba_df["transition_freq"] = trans_proba_df["transition_freq"] / outgoing_total

    if round_proba is not None:
        trans_proba_df["transition_freq"] = trans_proba_df["transition_freq"].round(round_proba)

    return trans_proba_df

def to_dict(trans_proba_df):
    """
    Convert a transition table into a nested ``{origin: {destination: value}}`` dict.

    Parameters
    ----------
    trans_proba_df: pandas.DataFrame
        Must have the columns ``origin``, ``destination`` and
        ``transition_freq``.

    Returns
    -------
    dict
        Maps every origin to a dict of its destinations and their
        ``transition_freq`` value. If an (origin, destination) pair occurs
        more than once, the last row wins. An empty frame gives ``{}``.
    """
    trans_proba_dict = {}
    # Walk the three columns in lockstep instead of calling DataFrame.apply
    # only for its side effect, which built a throwaway Series per row.
    rows = zip(trans_proba_df["origin"],
               trans_proba_df["destination"],
               trans_proba_df["transition_freq"])
    for origin, destination, probability in rows:
        trans_proba_dict.setdefault(origin, {})[destination] = probability

    return trans_proba_dict
    
def add_value(row, trans_proba_dict):
    """
    Record one transition row in the nested transition dict.

    ``row`` is indexed by the keys "origin", "destination" and
    "transition_freq". The value is stored at
    ``trans_proba_dict[origin][destination]``, creating the inner dict on
    first use and overwriting an existing entry. The dict is modified in
    place and also returned.
    """
    origin, destination = row["origin"], row["destination"]
    # setdefault creates the inner dict the first time an origin is seen.
    trans_proba_dict.setdefault(origin, {})[destination] = row["transition_freq"]
    return trans_proba_dict
    

In [None]:
# Estimate the transition table from the tag sequence and convert it to the
# nested-dict form that MarkovChain takes.
tp = transition_probabilities(sequence_tags.tolist())
tp_dict = to_dict(tp)
# Only a cell's last expression is rendered, so the table is shown here at the
# end rather than in the middle of the cell.
tp

In [None]:
# Markov chain over the stop-region tags, built from the estimated transitions.
destination_chain = MarkovChain(transition_prob=tp_dict)
destination_chain.states