# Reproduce results of trained CNN & CNN-Attention models 

In [15]:
import json
from collections import Counter
from sklearn.model_selection import train_test_split
import kashgari
from kashgari.tasks.classification import CNN_Attention_Model, CNN_Model

# Mixed-topic opinion / non-opinion data: a labelled pool (split into train/dev
# below) and a separate held-out test set.
with open('3_in_1/cnn_x.json', 'r', encoding='utf-8') as x_file, \
        open('3_in_1/cnn_y.json', 'r', encoding='utf-8') as y_file:
    store_x, store_y = json.load(x_file), json.load(y_file)
with open('3_in_1/testcnn_x.json', 'r', encoding='utf-8') as x_file, \
        open('3_in_1/testcnn_y.json', 'r', encoding='utf-8') as y_file:
    test_x, test_y = json.load(x_file), json.load(y_file)

# Train / dev split. The outputs are unpacked in "swapped" order on purpose:
# the part sized by test_size (95%) is used as the training set and the
# remaining 5% as the dev set.
valid_x, train_x, valid_y, train_y = train_test_split(
    store_x,
    store_y,
    test_size=0.95,
    random_state=42,
)

print(f"train sample: {len(train_x)}")
print(f"dev sample: {len(valid_x)}")
print(f"test sample: {len(test_x)}")
# Label distribution of each split, in train / dev / test order.
for labels in (train_y, valid_y, test_y):
    print(Counter(labels))

# Restore the saved CNN classifier and score it on the held-out test set.
# The evaluate() call stays last so the notebook displays its return value.
load_CNN = CNN_Model.load_model('CNN_94_95_93')
load_CNN.evaluate(test_x, test_y)

train sample: 19952
dev sample: 1050
test sample: 1105
Counter({'non-opinion': 11746, 'opinion': 8206})
Counter({'non-opinion': 612, 'opinion': 438})
Counter({'non-opinion': 642, 'opinion': 463})


2022-06-22 09:43:09,567 [DEBUG] kashgari - ------------------------------------------------
2022-06-22 09:43:09,568 [DEBUG] kashgari - Loaded gensim word2vec model's vocab
2022-06-22 09:43:09,568 [DEBUG] kashgari - model        : mat_embedding3/vectors.txt
2022-06-22 09:43:09,569 [DEBUG] kashgari - word count   : 529690
2022-06-22 09:43:09,569 [DEBUG] kashgari - Top 50 words : ['the', 'of', '.', ',', 'and', '<nUm>', 'in', 'a', 'to', ')', '(', 'with', '-', 'for', 'is', 'by', 'on', 'was', 'at', 'were', 'that', '–', 'as', 'are', 'from', '/', 'an', 'temperature', 'surface', 'using', 'high', 'which', 'C', '°', 'this', '%', 'In', 'it', 'A', '=', 'structure', 'properties', ':', 'phase', 'results', 'effect', 'these', 'than', 'based', 'different']
2022-06-22 09:43:09,569 [DEBUG] kashgari - ------------------------------------------------
2022-06-22 09:43:10,993 [DEBUG] kashgari - predict input shape (1105, 282) x: 
[[   0    1    1 ...    0    0    0]
 [   0    1 3255 ...    0    0    0]
 [   0

              precision    recall  f1-score   support

 non-opinion     0.9446    0.9564    0.9505       642
     opinion     0.9385    0.9222    0.9303       463

    accuracy                         0.9421      1105
   macro avg     0.9415    0.9393    0.9404      1105
weighted avg     0.9420    0.9421    0.9420      1105



{'detail': {'non-opinion': {'precision': 0.9446153846153846,
   'recall': 0.956386292834891,
   'f1-score': 0.9504643962848297,
   'support': 642},
  'opinion': {'precision': 0.9384615384615385,
   'recall': 0.9222462203023758,
   'f1-score': 0.9302832244008714,
   'support': 463},
  'accuracy': 0.9420814479638009,
  'macro avg': {'precision': 0.9415384615384615,
   'recall': 0.9393162565686334,
   'f1-score': 0.9403738103428505,
   'support': 1105},
  'weighted avg': {'precision': 0.9420368952314654,
   'recall': 0.9420814479638009,
   'f1-score': 0.9420083939479313,
   'support': 1105}},
 'precision': 0.9420368952314654,
 'recall': 0.9420814479638009,
 'f1-score': 0.9420083939479313,
 'support': 1105}

In [16]:
# load mixed-topic dataset with opportunity(driver)/challenges(barrier) annotation
with open('3_in_1/cnnatt_x.json', 'r', encoding='utf-8') as f:
    x_data = json.load(f)
with open('3_in_1/cnnatt_y.json', 'r', encoding='utf-8') as f:
    y_data = json.load(f)
with open('3_in_1/testcnnatt_x.json', 'r', encoding='utf-8') as f:
    test_x = json.load(f)
with open('3_in_1/testcnnatt_y.json', 'r', encoding='utf-8') as f:
    test_y = json.load(f)

# add augmented data by SMOTE
# Read with an explicit UTF-8 encoding like every other JSON file in this
# notebook; without it the decoding depends on the platform's locale.
with open('3_in_1/augmented_barrier.json', 'r', encoding='utf-8') as file_obj:
    barrier = json.load(file_obj)
# Every augmented sample belongs to the minority 'barrier' class.
x_data.extend(barrier)
y_data.extend(['barrier'] * len(barrier))

# split train & dev set
# The outputs are unpacked in "swapped" order on purpose: the part sized by
# test_size (91%) becomes the training set and the remaining 9% the dev set.
valid_x, train_x, valid_y, train_y = train_test_split(x_data, y_data, stratify=y_data, test_size=0.91, random_state=42)

print(f"train sample: {len(train_x)}")
print(f"dev sample: {len(valid_x)}")
print(f"test sample: {len(test_x)}")
print(Counter(train_y))
print(Counter(valid_y))
print(Counter(test_y))

# Restore the saved CNN-Attention classifier and score it on the test set.
# The evaluate() call stays last so the notebook displays its return value.
load_CNN_Attention = CNN_Attention_Model.load_model('CNN_Attention_91')
load_CNN_Attention.evaluate(test_x, test_y)

train sample: 9224
dev sample: 912
test sample: 950
Counter({'driver': 6337, 'barrier': 2887})
Counter({'driver': 627, 'barrier': 285})
Counter({'driver': 774, 'barrier': 176})


2022-06-22 09:44:50,633 [DEBUG] kashgari - ------------------------------------------------
2022-06-22 09:44:50,634 [DEBUG] kashgari - Loaded gensim word2vec model's vocab
2022-06-22 09:44:50,634 [DEBUG] kashgari - model        : mat_embedding3/vectors.txt
2022-06-22 09:44:50,634 [DEBUG] kashgari - word count   : 529690
2022-06-22 09:44:50,635 [DEBUG] kashgari - Top 50 words : ['the', 'of', '.', ',', 'and', '<nUm>', 'in', 'a', 'to', ')', '(', 'with', '-', 'for', 'is', 'by', 'on', 'was', 'at', 'were', 'that', '–', 'as', 'are', 'from', '/', 'an', 'temperature', 'surface', 'using', 'high', 'which', 'C', '°', 'this', '%', 'In', 'it', 'A', '=', 'structure', 'properties', ':', 'phase', 'results', 'effect', 'these', 'than', 'based', 'different']
2022-06-22 09:44:50,635 [DEBUG] kashgari - ------------------------------------------------
2022-06-22 09:44:52,416 [DEBUG] kashgari - predict input shape (950, 171) x: 
[[   0    1   18 ...    0    0    0]
 [   0    1  174 ...    0    0    0]
 [   0 

              precision    recall  f1-score   support

     barrier     0.7727    0.7727    0.7727       176
      driver     0.9483    0.9483    0.9483       774

    accuracy                         0.9158       950
   macro avg     0.8605    0.8605    0.8605       950
weighted avg     0.9158    0.9158    0.9158       950



{'detail': {'barrier': {'precision': 0.7727272727272727,
   'recall': 0.7727272727272727,
   'f1-score': 0.7727272727272727,
   'support': 176},
  'driver': {'precision': 0.9483204134366925,
   'recall': 0.9483204134366925,
   'f1-score': 0.9483204134366925,
   'support': 774},
  'accuracy': 0.9157894736842105,
  'macro avg': {'precision': 0.8605238430819826,
   'recall': 0.8605238430819826,
   'f1-score': 0.8605238430819826,
   'support': 950},
  'weighted avg': {'precision': 0.9157894736842105,
   'recall': 0.9157894736842105,
   'f1-score': 0.9157894736842105,
   'support': 950}},
 'precision': 0.9157894736842105,
 'recall': 0.9157894736842105,
 'f1-score': 0.9157894736842105,
 'support': 950}

# Apply opinion mining to plain text

In [36]:
# Example plain-text passage; it is passed to sen_seg() below and the resulting
# sentences are then run through the two loaded classifiers.
text = 'Owing to the importance of surface passivation to BSi, major passivation techniques using SiNx, thermal oxide, Al2O3 and a-Si have been critically examined. It is found that atomic layer deposited Al2O3 offers excellent surface conformality and passivation to the silicon surface, especially on p+-emitters. With ALD Al2O3 passivation, a record high 18.7% efficient BSi solar cell has been successfully fabricated. As the market share of n-type solar cells (with p+-emitters) is expected to rise in the near future, this passivation technique is particularly attractive and may become a new industry standard.'

In [37]:
import nltk # if no nltk, use split('. ')

# 1. segmentation + tokenization
def sen_seg(data):
    """Split raw text into sentences and tokenize each sentence.

    Common abbreviations that end in a period ('et al. ', 'Fig. ', ...) are
    masked before sentence splitting and restored afterwards, so the splitter
    does not see their trailing ". " as a sentence boundary.

    Args:
        data: The text to segment, as a single string.

    Returns:
        A list with one entry per sentence found by ``nltk.sent_tokenize``;
        each entry is the token list produced by ``nltk.word_tokenize``.
    """
    abbreviations = ['et al. ', 'Fig. ', 'e.g. ', 'i.e. ', 'Ref. ', 'Figs. ', ' ca. ', 'approx. ', '(ca. ', 'etc.) ']
    marker = '####@'

    # Mask: drop the last two characters of each abbreviation (". ", or ") "
    # for 'etc.) ') and put the marker in their place.
    masked = data
    for abbr in abbreviations:
        masked = masked.replace(abbr, abbr[:-2] + marker)

    def unmask(sentence):
        # Undo the masking inside a single sentence.
        for abbr in abbreviations:
            sentence = sentence.replace(abbr[:-2] + marker, abbr)
        return sentence

    # Without nltk, masked.split('. ') and sentence.split(' ') are the rough
    # fallbacks suggested by the original author.
    return [
        nltk.word_tokenize(unmask(sentence))
        for sentence in nltk.sent_tokenize(masked)
    ]

# Segment the sample passage; `sens` holds one token list per sentence and is
# reused by the classification cells that follow.
sens = sen_seg(text)
for sentence_tokens in sens:
    print(sentence_tokens)

['Owing', 'to', 'the', 'importance', 'of', 'surface', 'passivation', 'to', 'BSi', ',', 'major', 'passivation', 'techniques', 'using', 'SiNx', ',', 'thermal', 'oxide', ',', 'Al2O3', 'and', 'a-Si', 'have', 'been', 'critically', 'examined', '.']
['It', 'is', 'found', 'that', 'atomic', 'layer', 'deposited', 'Al2O3', 'offers', 'excellent', 'surface', 'conformality', 'and', 'passivation', 'to', 'the', 'silicon', 'surface', ',', 'especially', 'on', 'p+-emitters', '.']
['With', 'ALD', 'Al2O3', 'passivation', ',', 'a', 'record', 'high', '18.7', '%', 'efficient', 'BSi', 'solar', 'cell', 'has', 'been', 'successfully', 'fabricated', '.']
['As', 'the', 'market', 'share', 'of', 'n-type', 'solar', 'cells', '(', 'with', 'p+-emitters', ')', 'is', 'expected', 'to', 'rise', 'in', 'the', 'near', 'future', ',', 'this', 'passivation', 'technique', 'is', 'particularly', 'attractive', 'and', 'may', 'become', 'a', 'new', 'industry', 'standard', '.']


In [38]:
# 2. opinion extraction: keep the sentences that the CNN labels 'opinion'
result = load_CNN.predict(sens)
# result[i] is the predicted label for sens[i].
find_opinion = [tx for i, tx in enumerate(sens) if result[i] == 'opinion']

print(len(find_opinion), 'opinions is found by CNN')
print(find_opinion)

2022-06-22 19:22:43,680 [DEBUG] kashgari - predict input shape (4, 37) x: 
[[    0     1    12     4  1901     5    32  1842    12 26580     7   761
   1842   371    33  7117     7   162   166     7   416     8  2050   117
   2338 10596   426     6     0     0     0     0     0     0     0     0
      0]
 [    0     1    18    71    24   474    72   255   416  2770   603    32
  45533     8  1842    12     4   293    32     7   766    20     1     6
      0     0     0     0     0     0     0     0     0     0     0     0
      0]
 [    0     1  3575   416  1842     7    11  2342    34     1    39   621
  26580   643   391   136  2338   886   347     6     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0]
 [    0   302     4  5199  7963     5     1   643   589    14    15     1
     13    18   785    12  1874    10     4   312  1508     7    38  1842
    203    18  1392  2684     8   797  1751    11   126  2916   800     6
      0]]


2 opinions is found by CNN
[['It', 'is', 'found', 'that', 'atomic', 'layer', 'deposited', 'Al2O3', 'offers', 'excellent', 'surface', 'conformality', 'and', 'passivation', 'to', 'the', 'silicon', 'surface', ',', 'especially', 'on', 'p+-emitters', '.'], ['As', 'the', 'market', 'share', 'of', 'n-type', 'solar', 'cells', '(', 'with', 'p+-emitters', ')', 'is', 'expected', 'to', 'rise', 'in', 'the', 'near', 'future', ',', 'this', 'passivation', 'technique', 'is', 'particularly', 'attractive', 'and', 'may', 'become', 'a', 'new', 'industry', 'standard', '.']]


## Optional: corpus-comparison method to add or remove opinions until a target ratio is reached
### Note: If you don't need this part, skip it and run the next cell

In [42]:
# optional function: corpus comparison to adjust number of opinions
def rank_opinion_by_lexicon(sens, lexicon_file):
    """Rank tokenized sentences by how many opinion cue words they contain.

    Scoring, per token:
      * +1 if the token, as written or lower-cased, is in the lexicon;
      * +2 if the lower-cased token is one of the modal verbs listed below.
    A token can earn both.

    Args:
        sens: List of sentences, each a list of string tokens.
        lexicon_file: Path to a JSON file holding the lexicon entries. It is
            read as UTF-8, like the other JSON files in this notebook, so the
            result does not depend on the platform's default encoding.

    Returns:
        A new list with the same sentences ordered by descending score.
        Sentences with equal scores keep their original relative order.
    """
    words_md = ['could', 'may', 'would', 'must', 'might', 'shall', 'ought', 'can']
    with open(lexicon_file, 'r', encoding='utf-8') as file_obj:
        pri_list = json.load(file_obj)

    def sentence_score(tokens):
        # Sum the lexicon hits (+1) and modal-verb hits (+2) of one sentence.
        score = 0
        for token in tokens:
            lowered = token.lower()
            if lowered in pri_list or token in pri_list:
                score += 1
            if lowered in words_md:
                score += 2
        return score

    # sorted() is stable even with reverse=True, so ties stay in input order.
    return sorted(sens, key=sentence_score, reverse=True)


RATE = 0.25 # a threshold which can be set by user, if whole page, 0.1-0.2 is rather suitable
# All sentences, best lexicon score first.
support_opinion = rank_opinion_by_lexicon(sens, 'final_200.json')
# Target number of opinions for this text.
need_length = int(RATE * len(sens))

if len(find_opinion) >= need_length:
    print('Too many opinions, so rank & remove low score opinion')
    # CNN-detected opinions, in lexicon-rank order.
    ranked_found = [so for so in support_opinion if so in find_opinion]
    # Keep only the top need_length. When the target rounds down to 0 nothing
    # is removed, so the next step never receives an empty list.
    selected = ranked_found[:need_length] if need_length > 0 else ranked_found
else:
    print('Too few opinions, add high score candidate')
    # Top-ranked sentences the CNN did not flag fill the remaining slots.
    rest = [x for x in support_opinion if x not in find_opinion]
    selected = find_opinion + rest[:need_length - len(find_opinion)]

# The classification cell reads `find_opinion`, whether or not this optional
# cell was run, so the adjusted list is stored under the same name.
find_opinion = selected

print(len(find_opinion), 'opinions found in total by', RATE, '*', str(len(sens)))
print(find_opinion)

Too many opinions, so rank & remove low score opinion
1 opinions found in total by 0.25 * 4
[['As', 'the', 'market', 'share', 'of', 'n-type', 'solar', 'cells', '(', 'with', 'p+-emitters', ')', 'is', 'expected', 'to', 'rise', 'in', 'the', 'near', 'future', ',', 'this', 'passivation', 'technique', 'is', 'particularly', 'attractive', 'and', 'may', 'become', 'a', 'new', 'industry', 'standard', '.']]


In [43]:
# 3. opinion classification: 'driver' predictions are opportunities,
#    everything else is treated as a challenge
result = load_CNN_Attention.predict(find_opinion)
find_opps, find_chas = [], []
for i, tx in enumerate(find_opinion):
    # result[i] is the predicted label for find_opinion[i].
    bucket = find_opps if result[i] == 'driver' else find_chas
    bucket.append(tx)

print(len(find_opps), 'opportunities are found by CNN-Attention.')
for opportunity in find_opps:
    print(' '.join(opportunity))
print('\n')
print(len(find_chas), 'challenges are found by CNN-Attention.')
for challenge in find_chas:
    print(' '.join(challenge))

2022-06-22 19:30:17,263 [DEBUG] kashgari - predict input shape (1, 37) x: 
[[   0  302    4 5199 7963    5    1  643  589   14   15    1   13   18
   785   12 1874   10    4  312 1508    7   38 1842  203   18 1392 2684
     8  797 1751   11  126 2916  800    6    0]]
2022-06-22 19:30:17,305 [DEBUG] kashgari - predict output shape (1, 2)
2022-06-22 19:30:17,305 [DEBUG] kashgari - predict output argmax: [0]


1 opportunities are found by CNN-Attention.
As the market share of n-type solar cells ( with p+-emitters ) is expected to rise in the near future , this passivation technique is particularly attractive and may become a new industry standard .


0 challenges are found by CNN-Attention.
