# pip install river

In [14]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import river #ML package


In [15]:
# Load Estimators
from river.linear_model import LogisticRegression
from river.naive_bayes import MultinomialNB
from river.feature_extraction import BagOfWords,TFIDF

#### Requirement
+ list of tuples
+ dictionary
+ CSV
    - list of tuples or dictionary record
    - iter_csv
    - iter_pandas

In [16]:
### Data: predict whether a text is hardware- or software-related
# Training examples, each a (text, label) tuple.
data = [
    ("my unit test failed", "software"),
    ("tried the program, but it was buggy", "software"),
    ("i need a new power supply", "hardware"),
    ("the drive has a 2TB capacity", "hardware"),
    ("unit-tests", "software"),
    ("program", "software"),
    ("power supply", "hardware"),
    ("drive", "hardware"),
    ("it needs more memory", "hardware"),
    ("check the API", "software"),
    ("design the API", "software"),
    ("they need more CPU", "hardware"),
    ("code", "software"),
    ("i found some bugs in the code", "software"),
    ("i swapped the memory", "hardware"),
    ("i tested the code", "software"),
]

# Separate examples in the same (text, label) form, kept apart from `data`.
test_data = [
    ("he writes code daily", "software"),
    ("the disk is faulty", "hardware"),
    ("refactor the code", "software"),
    ("no empty space on the drive", "hardware"),
]

### Text classification
* vectorize the text
* *  CountVectorizer/ BagOfWords
* *  TFIDF
* build model on the go

# Making the pipeline

In [17]:
#  Make a Pipeline
from river.compose import Pipeline

In [18]:
# Two named steps: a lowercasing bag-of-words vectorizer followed by a
# multinomial naive Bayes classifier.
pipe_nb = Pipeline(
    ('vectorizer', BagOfWords(lowercase=True)),
    ('nb', MultinomialNB()),
)
pipe_nb

In [19]:
# Get steps: inspect the named components of the pipeline and their parameters
pipe_nb.steps

OrderedDict([('vectorizer',
              BagOfWords (
                on=None
                strip_accents=True
                lowercase=True
                preprocessor=None
                tokenizer=<built-in method findall of re.Pattern object at 0x7f52bfc50d30>
                ngram_range=(1, 1)
              )),
             ('nb',
              MultinomialNB (
                alpha=1.
              ))])

In [20]:
# Fit on our data, learning one (text, label) example at a time.
# The per-example API is learn_one / predict_one in river (fit_one in creme).
for text, label in data:
    # learn_one updates the pipeline in place. Its return value is deliberately
    # ignored: newer river releases (0.19+) return None from learn_one, so
    # rebinding pipe_nb to the result would replace the pipeline with None
    # after the first example.
    pipe_nb.learn_one(text, label)

In [21]:
# Display the pipeline again, now that it has learned from every example in `data`.
pipe_nb

In [22]:
# Make Prediction
# Only the last expression of a cell is displayed, so the label returned by
# predict_one is not shown here; the per-class probabilities returned by
# predict_proba_one are.
pipe_nb.predict_one("I built an API")
pipe_nb.predict_proba_one("I built an API")

{'software': 0.732646964375691, 'hardware': 0.2673530356243093}

In [23]:
# NOTE(review): the recorded result for this sentence is 'software' although the
# text is about hardware. Presumably the small training set is the cause; call
# predict_proba_one on the same text to see how close the two classes are.
pipe_nb.predict_one("the hard drive  in the computer is damaged")

'software'

# Evaluation & Classification Report


In [24]:
# Predict a label for every sentence in test_data, echoing each sentence as it
# is processed. The true label `y` is unpacked but not used in this cell.
y_pred = []
for x, y in test_data:
    print(x)
    y_pred.append(pipe_nb.predict_one(x))

he writes code daily
the disk is faulty
refactor the code
no empty space on the drive


In [25]:
# Classification report: a metric object that is fed (true, predicted) pairs
# one at a time and renders per-class precision / recall / F1 / support.
from river.metrics import ClassificationReport
report = ClassificationReport()

In [26]:
# Rebuild the predictions, this time keeping the true labels in a parallel list
# so the two can be compared pairwise afterwards.
y_pred, y_test = [], []
for x, y in test_data:
    print(x)
    res = pipe_nb.predict_one(x)
    print("    prediction: ",res)
    print("    true value: ",y)
    y_pred.append(res)
    y_test.append(y)


he writes code daily
    prediction:  software
    true value:  software
the disk is faulty
    prediction:  software
    true value:  hardware
refactor the code
    prediction:  software
    true value:  software
no empty space on the drive
    prediction:  hardware
    true value:  hardware


# evaluation

In [27]:
# Feed every (true label, predicted label) pair into the classification report.
for yt, yp in zip(y_test, y_pred):
    # update() mutates the metric in place. Its return value is ignored on
    # purpose: newer river releases (0.19+) return None from update, so
    # `report = report.update(...)` would discard the report.
    report.update(yt, yp)

report

           Precision   Recall    F1       Support  
                                                   
hardware     100.00%    50.00%   66.67%         2  
software      66.67%   100.00%   80.00%         2  
                                                   
   Macro      83.33%    75.00%   73.33%            
   Micro      75.00%    75.00%   75.00%            
Weighted      83.33%    75.00%   73.33%            

                  75.00% accuracy                  

# Update the Model on the test data & Check Accuracy


In [28]:
# Test-then-train evaluation over test_data: for each example, predict first,
# score that prediction, and only afterwards let the model learn from it.
metric = river.metrics.Accuracy()
for text, label in test_data:
    # Prediction made before the model has seen this example's label.
    y_pred_before = pipe_nb.predict_one(text)
    # update() and learn_one() both work in place. Their return values are not
    # rebound because newer river releases (0.19+) return None from them.
    metric.update(label, y_pred_before)
    pipe_nb.learn_one(text, label)

In [29]:
# Accuracy of the predictions made before each test example was learned.
metric

Accuracy: 75.00%

# Update the Model on the train data & Check Accuracy


In [30]:
# Same predict-then-learn loop, now over the training examples in `data`.
# The pipeline already learned from each of these in the earlier fitting cell,
# so this accuracy reflects fit on seen data, not generalization. Note that
# the loop also learns from every training example a second time.
metric2 = river.metrics.Accuracy()
for text, label in data:
    y_pred_before = pipe_nb.predict_one(text)
    # In-place calls; the return values are not rebound because newer river
    # releases (0.19+) return None from update / learn_one.
    metric2.update(label, y_pred_before)
    pipe_nb.learn_one(text, label)

metric2

Accuracy: 100.00%