In [3]:
# Importing metadata zip file and converting it to dataframe (based on original code provided by the datasource )
import json
import gzip
import pandas as pd

def getDF(path):
  # Unzip the file, load in each line as an object
  g = gzip.open(path, 'rb')
  g = [json.loads(l) for l in g]

  # Map to a dictionary, then load in as a dataframe
  dict_df = {i: d for (i, d) in enumerate(g)}
  return pd.DataFrame.from_dict(dict_df, orient='index')

df = getDF('meta_ALL_Beauty.json.gz')

In [4]:
# Basic cleaning steps

# Selecting only the columns that are required for analysis
description_and_asin = ["description","asin"]
df = df[description_and_asin]

# Converting descriptions to a single sting because descriptions are in a list of strings, and removing \n
df["description_str"] = df.description.map(lambda x: ".".join(x).replace("\n",""))

# Remove empty or null descriptions
df = df.drop(df[ ( (df["description_str"] == "" ) | ( df["description_str"].isnull()) ) ].index)
#15108


In [5]:
# Remove duplicate asin and descriptions so can have only unique products for classifier
df[df.duplicated("asin")]
df = df.drop_duplicates("asin", keep="last")

df[df.duplicated("description_str")]
df = df.drop_duplicates("description_str", keep="last")

# Removing redundant old description column
description_str_and_asin = ["description_str","asin"]
df = df[description_str_and_asin]
df.describe() #13743


Unnamed: 0,description_str,asin
count,13743,13743
unique,13743,13743
top,Loud 'N Clear Personal Sound Amplifier allows ...,6546546450
freq,1,1


In [6]:
# Used later for write up
'''# Identify range of descriptions in after duplicates removal
df_descriptions_without_empty = df["description_str"].apply(lambda x: len(x.split()))
df_descriptions_without_empty.describe([0.1,0.15,0.20,0.25,0.30,0.75,0.85,0.90,0.92,0.95,0.97,0.98,0.99])
# 97% upper limit of 259 (same % milit as reviews)
# 11 words (same numbe of words as reviews) 20% because no assuption is made on how much more each one is informative as well as descriptions 
# being a smaller sample
#13743'''


'# Identify range of descriptions in after duplicates removal\ndf_descriptions_without_empty = df["description_str"].apply(lambda x: len(x.split()))\ndf_descriptions_without_empty.describe([0.1,0.15,0.20,0.25,0.30,0.75,0.85,0.90,0.92,0.95,0.97,0.98,0.99])\n# 97% upper limit of 259 (same % milit as reviews)\n# 11 words (same numbe of words as reviews) 20% because no assuption is made on how much more each one is informative as well as descriptions \n# being a smaller sample\n#13743'

In [7]:
# Remove descriptions within set limit of words

# Split descriptions at white spaces
df["num_words_description"] = df["description_str"].apply(lambda x: len(x.split()))

# Remove descriptions that don't meet this criteria
df = df[(df["num_words_description"] <= 259) & (df["num_words_description"] >= 14)]

df["description_str"].describe() #9940

count                                                  9940
unique                                                 9940
top       Loud 'N Clear Personal Sound Amplifier allows ...
freq                                                      1
Name: description_str, dtype: object

In [8]:
# Later used for write up
'''df_descriptions_without_empty = df["description_str"].apply(lambda x: len(x.split()))
df_descriptions_without_empty.describe([0.1,0.15,0.20,0.25,0.30,0.75,0.85,0.90,0.92,0.95,0.97,0.98,0.99])'''

'df_descriptions_without_empty = df["description_str"].apply(lambda x: len(x.split()))\ndf_descriptions_without_empty.describe([0.1,0.15,0.20,0.25,0.30,0.75,0.85,0.90,0.92,0.95,0.97,0.98,0.99])'

In [9]:
# To csv for sense checking - TODO later will be removed
df.to_csv("Descriptions before cleaning.csv")

In [10]:
# Sense check - TODO later will be removed
df 

Unnamed: 0,description_str,asin,num_words_description
0,Loud 'N Clear Personal Sound Amplifier allows ...,6546546450,37
2,No7 Stay Perfect Foundation now stays perfect ...,7250468162,96
4,Lacto Calamine Skin Balance Daily Nourishing L...,7414204790,14
7,"According to the legend, in 1613, two brothers...",8279996397,77
8,Novi prevod proslavljene knjige Zadruga objavl...,8637910351,53
...,...,...,...
32879,Theres no finer way for a chap to get ready fo...,B01HIH2QTU,21
32880,Move over soap on a rope! This heavy-duty Bric...,B01HIHLFOC,18
32884,Eau de parfum spray vial mini design house: yv...,B01HIPOQ2M,21
32885,Pokemon Plush 9.2 Inch / 23cm Gengar Doll Stuf...,B01HIUEEHO,15


In [11]:
# Regrex for removing characters
import re

# ----------------------------- Check which bits i needs still TODO
# Spacy for spell check
import spacy
import contextualSpellCheck
nlp = spacy.load("en_core_web_sm")
contextualSpellCheck.add_to_pipe(nlp)

# Spacy stop word creation
stopping_words = spacy.lang.en.stop_words.STOP_WORDS
stopping_words_new = stopping_words
# -----------------------------
print(stopping_words)
print(len(stopping_words))

# NLTK for tokenization and lemmatization
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')


{'forty', '‘s', 'off', 'around', 'further', 'same', 'throughout', 'amongst', 'does', 'hereupon', 'front', 'being', 'became', 'someone', 'whereas', 'again', 'although', 'will', 'seeming', 'anyhow', 'own', 'amount', 'become', 'however', 'would', 'nor', 'bottom', 'sometimes', 'often', "'m", 'over', 'rather', 'very', 'please', '‘m', 'any', 'because', 'if', 'twelve', '‘re', 'via', '’ve', 'within', 'even', 'really', 'full', 'becoming', 'first', 'along', 'regarding', 'fifteen', 'another', 'show', 'nobody', 'most', 'whom', 'on', 'top', 'there', 'out', 'whither', 'hereby', 'an', 'serious', 'in', 'never', 'enough', 'himself', 'whereupon', 'something', 'here', '‘ve', 'neither', 'are', 'between', 'less', 'seemed', 'onto', 'every', 'across', 'three', 'call', 'everywhere', 'also', 'after', 'anyway', "'s", 'perhaps', 'per', 'anyone', 'of', 'not', 'than', 'thus', 'their', 'too', 'how', 'these', '‘d', 'beyond', 'several', 'is', 'had', 'afterwards', 'just', 'unless', 'latterly', 'n‘t', 'alone', 'hence',

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Removing certain stop words from the list of stop words because they are part of our seed terms
stopping_words_to_keep = ['no']
for word in list(stopping_words_new):
    if word in stopping_words_to_keep:
        stopping_words_new.remove(word)
print(stopping_words_new)
print(len(stopping_words_new))


{'forty', '‘s', 'off', 'around', 'further', 'same', 'throughout', 'amongst', 'does', 'hereupon', 'front', 'being', 'became', 'someone', 'whereas', 'again', 'although', 'will', 'seeming', 'anyhow', 'own', 'amount', 'become', 'however', 'would', 'nor', 'bottom', 'sometimes', 'often', "'m", 'over', 'rather', 'very', 'please', '‘m', 'any', 'because', 'if', 'twelve', '‘re', 'via', '’ve', 'within', 'even', 'really', 'full', 'becoming', 'first', 'along', 'regarding', 'fifteen', 'another', 'show', 'nobody', 'most', 'whom', 'on', 'top', 'there', 'out', 'whither', 'hereby', 'an', 'serious', 'in', 'never', 'enough', 'himself', 'whereupon', 'something', 'here', '‘ve', 'neither', 'are', 'between', 'less', 'seemed', 'onto', 'every', 'across', 'three', 'call', 'everywhere', 'also', 'after', 'anyway', "'s", 'perhaps', 'per', 'anyone', 'of', 'not', 'than', 'thus', 'their', 'too', 'how', 'these', '‘d', 'beyond', 'several', 'is', 'had', 'afterwards', 'just', 'unless', 'latterly', 'n‘t', 'alone', 'hence',

In [13]:
# Preprocessing of keywords

def preprocessing_keywords(raw_string):
    
    # Remove html tags and anything inside them 
    no_html = re.sub(r'<[^>]*>','', raw_string)

    # Make everything lowercase
    lowercase_column = no_html.lower()

    # TODO have a look at apostrophe again
    # Remove apostrophe 
    without_apostrophe = re.sub(r'[\']', '', lowercase_column)
    
    # Remove all non alphabetic instances that aren't a space and replace them with a space 
    alphabetic_keywords = re.sub(r'[^a-z\s]', ' ', without_apostrophe)

    # Tokenize string into individual words
    tokens = word_tokenize(alphabetic_keywords) 
    
    # Remove stopping words
    tokens_without_stopping_words = [token for token in tokens if token not in stopping_words_new]

    # Lemmatize tokens using nltk and join them into phrases
    sentances_without_stop_words = ' '.join([lemmatizer.lemmatize(t) for t in tokens_without_stopping_words])

    return sentances_without_stop_words


In [14]:
###############################################################
#                      Cleaning Keywords                                

#ngram (1,2)
enviromental = ['recyclable', 'recycled', ' Environmentally friendly ','biodegradable', ' no packaging', 'sustainable', ' ecological ', 'plastic-free', 'compostable', 'renewable', 'reusable', 'biodegradable', 'organic', 'refillable', 'refills', 'recycled', 'reef safe',  'no oxybenzone ',  'triclosan-free', 'low-impact', 'soil association', 'conservation', 'COSMOS ', 'NATRUE ', 'FSC', 'eco']   


social = ['cruelty-free', 'equality', 'PETA', 'leaping bunny', 'fair trade', 'no animal', 'ethical ', ' Non-profit ', ' Donated ']  


economic = ['Fair trade', 'renewable', 'locally sourced', 'small business', 'recycled', ' Fair wage', 'community', ' Economic prosperity ', ' local ingredients ', 'local farmers ']   


health = ['non-toxic', 'no toxic', 'organic', 'paraben-free', 'triclosan-free', 'phthalates-free', 'non-nano', 'formaldehyde-free', 'non GMO', 'soil association', 'COSMOS ', 'NATRUE', 'USDA', ' Fragrance-free ', ' no fragrance ', ' sulfate free ']   



# Cleaning keywords/phrases 
enviro_p = list(set(preprocessing_keywords(phrase) for phrase in enviromental))
social_p = list(set(preprocessing_keywords(phrase) for phrase in social))
economic_p = list(set(preprocessing_keywords(phrase) for phrase in economic))
health_p = list(set(preprocessing_keywords(phrase) for phrase in health))

print(enviro_p)
print(social_p)
print(economic_p)
print(health_p)


['compostable', 'low impact', 'conservation', 'triclosan free', 'no packaging', 'fsc', 'plastic free', 'refillable', 'sustainable', 'cosmos', 'organic', 'recyclable', 'reusable', 'refill', 'renewable', 'recycled', 'environmentally friendly', 'eco', 'no oxybenzone', 'reef safe', 'soil association', 'biodegradable', 'natrue', 'ecological']
['non profit', 'leaping bunny', 'cruelty free', 'equality', 'no animal', 'donated', 'peta', 'ethical', 'fair trade']
['local farmer', 'community', 'economic prosperity', 'small business', 'fair wage', 'locally sourced', 'renewable', 'recycled', 'local ingredient', 'fair trade']
['paraben free', 'non toxic', 'formaldehyde free', 'organic', 'fragrance free', 'no toxic', 'sulfate free', 'phthalates free', 'non nano', 'triclosan free', 'soil association', 'no fragrance', 'usda', 'natrue', 'non gmo', 'cosmos']


In [15]:

# Spell check (creating dictionary)
import pkg_resources
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
# term_index is the column of the term and count_index is the column of the term frequency
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# Combining all keywords as individual words
anchor_words_combined = list(set([word for sublist in [enviro_p, social_p, economic_p, health_p] for phrase in sublist for word in phrase.split()]))

# Add all words to the dictionary
for word in anchor_words_combined:
    suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
    if len(suggestions) > 0 and suggestions[0].term == word:
        # the best suggestion for this word is itself - it must exist in the dictionary
        continue

    # add to the dictionary with frequency 1 
    sym_spell.create_dictionary_entry(word, 1)
    

In [16]:
# Preprocessing of descriptions

def preprocessing(raw_string):
    # Remove html tags and anything inside them 
    no_html = re.sub(r'<[^>]*>','', raw_string)

    # Make everything lowercase
    lowercase_column = no_html.lower()

    # Remove apostrophe for uniformity and to enable spell check to correct words with apostrophe
    without_apostrophe = re.sub(r'[\']', '', lowercase_column)

    # Remove all non alphabetic instances that aren't a space and replace them with a space 
    alphabetic_column = re.sub(r'[^a-z\s]', ' ', without_apostrophe)

    # Tokenize string into individual words
    tokens = word_tokenize(alphabetic_column) 
    
    # For each word we will have a list of suggestions
    spelling_suggestions = [sym_spell.lookup(x, Verbosity.CLOSEST, max_edit_distance=2) for x in tokens]
    # Drop those that have no suggestions and top[0] suggestions for those who do
    spelling_suggestions = [x[0].term for x in spelling_suggestions if len(x) > 0]
    
    # Remove stopping words 
    tokens_without_stopping_words = [token for token in spelling_suggestions if token not in stopping_words_new]

    # Lemmatize tokens (nltk)
    sentances_without_stop_words = ' '.join([lemmatizer.lemmatize(t) for t in tokens_without_stopping_words])

    return sentances_without_stop_words

In [17]:
# Clean descriptions
df["clean_description"] = df["description_str"].apply(preprocessing)
# Export to CSV for sense checking
df["clean_description"].to_csv("Clean Descriptions.csv")

df["clean_description"]

0        loud clear personal sound amplifier allows tur...
2        no stay perfect foundation stay perfect longer...
4        calamine skin balance daily nourishing lotion ...
7        according legend brother native origin black b...
8        nov period knife priv colour martini kay divot...
                               ...                        
32879    no finer way chap ready close gentleman hardwa...
32880    soap rope heavy duty brick soap toughest gent ...
32884    eau de perfume spray vial mini design house yv...
32885    pokemon plush inch pm gear doll stuffed animal...
32890    brand new high quality enables fast volley ens...
Name: clean_description, Length: 9940, dtype: object

In [18]:
# Remove any empty descriptions or nulls 
df = df.drop(df[ ( (df["clean_description"] == "" ) | ( df["clean_description"].isnull()) ) ].index)

In [19]:
# For write up - identifying range of descriptions in after cleaning
'''df_description_without_empty_clean = df["clean_description"].apply(lambda x: len(x.split()))
df_description_without_empty_clean.describe([0.03,0.1,0.2,0.3,0.75,0.85,0.90,0.95]) '''

'df_description_without_empty_clean = df["clean_description"].apply(lambda x: len(x.split()))\ndf_description_without_empty_clean.describe([0.03,0.1,0.2,0.3,0.75,0.85,0.90,0.95]) '

In [20]:
# For write up - remove later
from collections import Counter
#Counter(" ".join(df["clean_description"]).split()).most_common(100)

In [21]:
# Reset index of the data frame for easy use
df = df.reset_index()
df

Unnamed: 0,index,description_str,asin,num_words_description,clean_description
0,0,Loud 'N Clear Personal Sound Amplifier allows ...,6546546450,37,loud clear personal sound amplifier allows tur...
1,2,No7 Stay Perfect Foundation now stays perfect ...,7250468162,96,no stay perfect foundation stay perfect longer...
2,4,Lacto Calamine Skin Balance Daily Nourishing L...,7414204790,14,calamine skin balance daily nourishing lotion ...
3,7,"According to the legend, in 1613, two brothers...",8279996397,77,according legend brother native origin black b...
4,8,Novi prevod proslavljene knjige Zadruga objavl...,8637910351,53,nov period knife priv colour martini kay divot...
...,...,...,...,...,...
9935,32879,Theres no finer way for a chap to get ready fo...,B01HIH2QTU,21,no finer way chap ready close gentleman hardwa...
9936,32880,Move over soap on a rope! This heavy-duty Bric...,B01HIHLFOC,18,soap rope heavy duty brick soap toughest gent ...
9937,32884,Eau de parfum spray vial mini design house: yv...,B01HIPOQ2M,21,eau de perfume spray vial mini design house yv...
9938,32885,Pokemon Plush 9.2 Inch / 23cm Gengar Doll Stuf...,B01HIUEEHO,15,pokemon plush inch pm gear doll stuffed animal...


In [22]:
# For evaluation
df.to_csv('yyy.csv', index=False)#CLEAN DESCRIPTIONS for Evaluation 11-4

In [23]:
#                       COREX VECTORIZATION
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Setting vectorizer to take phrases in range 1 to 2 (binary for corex)
vectorizer = CountVectorizer(binary=True, ngram_range= (1,2)) 

# Fit the vectorizer and transform it on the description corpus
doc_word = vectorizer.fit_transform(df["clean_description"])

words = vectorizer.get_feature_names_out()
print(len(words))

258992


In [24]:
# Code based on https://github.com/gregversteeg/corex_topic/blob/mastera/corextopic/example/corex_topic_example.ipynb
import corextopic.corextopic as ct
import corextopic.vis_topic as vt

anchor_words = [enviro_p, social_p, economic_p, health_p]

anchored_topic_model = ct.Corex(n_hidden=4, seed = 11)
anchored_topic_model.fit(doc_word, words=words, anchors=anchor_words, anchor_strength=11)

# Same results of corex model for evaluation
vt.vis_rep(anchored_topic_model, column_label=words, prefix='CX Desc')#FINAL 11-4 Descriptions CorEx Model

topics_list = anchored_topic_model.get_topics()

# Get top 10 phrases for each topic
top_words = []
for sublist in topics_list:
    phrase = [item[0] for item in sublist]
    top_words.append(phrase)

for index, phrase in enumerate(top_words):
    print(f"Theme {index+1}:", phrase)
    


Print topics in text file
Theme 1: ['organic', 'eco', 'biodegradable', 'recyclable', 'reusable', 'recycled', 'sustainable', 'no way', 'way connected', 'renewable']
Theme 2: ['cruelty free', 'no animal', 'fair trade', 'cruelty', 'donated', 'ethical', 'non profit', 'animal testing', 'trimmer', 'cordless']
Theme 3: ['recycled', 'fair trade', 'renewable', 'community', 'need', 'dimension', 'safe', 'fit people', 'hair', 'want']
Theme 4: ['organic', 'paraben free', 'fragrance free', 'sulfate free', 'skin', 'ingredient', 'oil', 'natural', 'extract', 'help']


In [25]:
# GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA 

In [26]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>     GLDA VECTORIZATION         <<<<<<<<<<<<<<<<<<<<<<<<<<<<<

from sklearn.feature_extraction.text import CountVectorizer

# Setting vectorizer to take phrases in range 1 to 2 (non binary for GLDA)
vectorizer_GLDA = CountVectorizer(binary=False, ngram_range= (1,2))

# Fit the vectorizer and transform & fit it on the description corpus
vectorised_descriptions_corpus = vectorizer_GLDA.fit_transform(df["clean_description"])
# TODO COMMENT
word2id = vectorizer_GLDA.vocabulary_ #columns dictionary

vocab_GLDA = vectorizer_GLDA.get_feature_names_out() 
print(len(vocab_GLDA))


258992


In [27]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>      Guided - LDA          <<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# Based on https://www.kaggle.com/code/nvpsani/topic-modelling-using-guided-lda who used workaround GLDA as well
from lda import guidedlda as glda
import numpy as np
# Anchor words list
anchor_words = [enviro_p, social_p, economic_p, health_p]
# Defining model
model = glda.GuidedLDA(n_topics=4, alpha=12.5, eta=0.1, n_iter=2000, random_state=99, refresh=50)

# Topics for the model: create a mapping from feature column index to topic ID
anchor_topics = {}
for topic_id in range(len(anchor_words)):
    key_word_list = anchor_words[topic_id]
    for word in key_word_list:
        col_index = word2id[word]
        anchor_topics[col_index] = topic_id

# Train model
model.fit(vectorised_descriptions_corpus, seed_topics=anchor_topics, seed_confidence=1.1)

# Extract the importance of each feature towards each topic, and display the top ones
NUM_TOP_FEATURES = 10
words_in_topics = model.topic_word_
for topic_id in range(len(words_in_topics)):
    # Get the feature importance for this one topic
    feature_importance = words_in_topics[topic_id]
    # Get the indices of the list when sorted
    sorted_indices = np.argsort(feature_importance)
    # Reverse to get most important first, and take the most important ones
    important_features_indices = sorted_indices[::-1][:NUM_TOP_FEATURES]
    # Use our vocab list to map to the feature names
    top_words = np.array(vocab_GLDA)[important_features_indices]
    
    print(f'Topic {topic_id}: {", ".join(top_words)}')

INFO:lda:n_documents: 9940
INFO:lda:vocab_size: 258992
INFO:lda:n_words: 833012
INFO:lda:n_topics: 4
INFO:lda:n_iter: 2000
INFO:lda:<0> log likelihood: -10435791
INFO:lda:<50> log likelihood: -9501028
INFO:lda:<100> log likelihood: -9494119
INFO:lda:<150> log likelihood: -9492199
INFO:lda:<200> log likelihood: -9491280
INFO:lda:<250> log likelihood: -9491900
INFO:lda:<300> log likelihood: -9491143
INFO:lda:<350> log likelihood: -9490851
INFO:lda:<400> log likelihood: -9491041
INFO:lda:<450> log likelihood: -9489746
INFO:lda:<500> log likelihood: -9493973
INFO:lda:<550> log likelihood: -9491381
INFO:lda:<600> log likelihood: -9492189
INFO:lda:<650> log likelihood: -9492700
INFO:lda:<700> log likelihood: -9489726
INFO:lda:<750> log likelihood: -9492319
INFO:lda:<800> log likelihood: -9494881
INFO:lda:<850> log likelihood: -9491351
INFO:lda:<900> log likelihood: -9492681
INFO:lda:<950> log likelihood: -9493106
INFO:lda:<1000> log likelihood: -9493619
INFO:lda:<1050> log likelihood: -94912

Topic 0: hair, use, color, lip, nail, brush, easy, water, soft, long
Topic 1: skin, help, natural, free, oil, formula, ingredient, cream, dry, vitamin
Topic 2: size, color, quality, high, pm, new, material, product, design, set
Topic 3: oil, fragrance, body, product, scent, soap, blend, essential, natural, note


In [28]:
# Transform the model to be able to set a threshold and get document topic lables
doc_topic = model.transform(vectorised_descriptions_corpus)
print(doc_topic)

# Set threshold
threshold = 0.45

doc_topic_thresholded = (doc_topic >= threshold).astype(int)
print(doc_topic_thresholded)


# Output it
df_glda_labels = pd.DataFrame(doc_topic_thresholded)
df_glda_labels.to_csv('Desc GLDA.txt', sep='\t', header=False)#FINAL 11-4 Descriptions GLDA Model

[[0.36675832 0.12444992 0.20783941 0.30095235]
 [0.62246763 0.27847643 0.0504222  0.04863373]
 [0.23821732 0.64294309 0.04099314 0.07784646]
 ...
 [0.04250318 0.0403353  0.0605309  0.85663061]
 [0.05972019 0.19781134 0.58722203 0.15524643]
 [0.21776969 0.06547084 0.50432374 0.21243573]]
[[0 0 0 0]
 [1 0 0 0]
 [0 1 0 0]
 ...
 [0 0 0 1]
 [0 0 1 0]
 [0 0 1 0]]
