In [1]:
import pandas as pd
import numpy as np
import nltk

from nltk import word_tokenize
from sqlalchemy.dialects.mssql.information_schema import columns

In [2]:
# Load bbc_text_cls.csv into a DataFrame; later cells read its 'text' and
# 'labels' columns. The path is relative to the notebook's working directory.
df = pd.read_csv('bbc_text_cls.csv')
df

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business
...,...,...
2220,BT program to beat dialler scams\n\nBT is intr...,tech
2221,Spam e-mails tempt net shoppers\n\nComputer us...,tech
2222,Be careful how you code\n\nA new European dire...,tech
2223,US cyber security chief resigns\n\nThe man mak...,tech


# From Scratch

In [18]:
# Build the vocabulary (token -> integer id) and, in the same pass, encode
# every document as the list of ids of its lower-cased tokens.
idx = 0              # next unused id
word2idx = {}
tokenized_docs = []
for doc in df['text']:
    doc_as_int = []
    for token in word_tokenize(doc.lower()):
        # setdefault returns the existing id, or stores and returns `idx`
        # when the token has not been seen before
        token_id = word2idx.setdefault(token, idx)
        if token_id == idx:
            # a new token just consumed `idx`; move on to the next free id
            idx += 1
        doc_as_int.append(token_id)
    tokenized_docs.append(doc_as_int)

In [19]:
word2idx

{'ad': 0,
 'sales': 1,
 'boost': 2,
 'time': 3,
 'warner': 4,
 'profit': 5,
 'quarterly': 6,
 'profits': 7,
 'at': 8,
 'us': 9,
 'media': 10,
 'giant': 11,
 'timewarner': 12,
 'jumped': 13,
 '76': 14,
 '%': 15,
 'to': 16,
 '$': 17,
 '1.13bn': 18,
 '(': 19,
 '£600m': 20,
 ')': 21,
 'for': 22,
 'the': 23,
 'three': 24,
 'months': 25,
 'december': 26,
 ',': 27,
 'from': 28,
 '639m': 29,
 'year-earlier': 30,
 '.': 31,
 'firm': 32,
 'which': 33,
 'is': 34,
 'now': 35,
 'one': 36,
 'of': 37,
 'biggest': 38,
 'investors': 39,
 'in': 40,
 'google': 41,
 'benefited': 42,
 'high-speed': 43,
 'internet': 44,
 'connections': 45,
 'and': 46,
 'higher': 47,
 'advert': 48,
 'said': 49,
 'fourth': 50,
 'quarter': 51,
 'rose': 52,
 '2': 53,
 '11.1bn': 54,
 '10.9bn': 55,
 'its': 56,
 'were': 57,
 'buoyed': 58,
 'by': 59,
 'one-off': 60,
 'gains': 61,
 'offset': 62,
 'a': 63,
 'dip': 64,
 'bros': 65,
 'less': 66,
 'users': 67,
 'aol': 68,
 'on': 69,
 'friday': 70,
 'that': 71,
 'it': 72,
 'owns': 73,
 '8

In [5]:
tokenized_docs[:11]

[[0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  16,
  26,
  27,
  28,
  17,
  29,
  30,
  31,
  23,
  32,
  27,
  33,
  34,
  35,
  36,
  37,
  23,
  38,
  39,
  40,
  41,
  27,
  42,
  28,
  1,
  37,
  43,
  44,
  45,
  46,
  47,
  48,
  1,
  31,
  12,
  49,
  50,
  51,
  1,
  52,
  53,
  15,
  16,
  17,
  54,
  28,
  17,
  55,
  31,
  56,
  7,
  57,
  58,
  59,
  60,
  61,
  33,
  62,
  63,
  5,
  64,
  8,
  4,
  65,
  27,
  46,
  66,
  67,
  22,
  68,
  31,
  3,
  4,
  49,
  69,
  70,
  71,
  72,
  35,
  73,
  74,
  15,
  37,
  75,
  41,
  31,
  76,
  56,
  77,
  44,
  78,
  27,
  68,
  27,
  79,
  80,
  81,
  82,
  31,
  72,
  83,
  84,
  85,
  40,
  23,
  50,
  51,
  7,
  57,
  86,
  87,
  40,
  23,
  88,
  24,
  89,
  31,
  90,
  27,
  23,
  91,
  49,
  68,
  92,
  93,
  5,
  94,
  95,
  96,
  52,
  74,
  15,
  69,
  23,
  97,
  37,
  98,
  44,
  99,
  100,
  31,
  72,
  101,
  16,

In [20]:
# reverse mapping: integer id -> token
# (ids are 0..V-1, so a plain list would work as well as a dict)
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

Note that this is somewhat inefficient because we're using a dictionary data structure, even though our indices are only integers from zero up to the vocabulary size.

It would be more efficient to store a list instead.

Since a list index is already an integer, as an exercise you may want to think about how to store the mapping that way instead.

In [13]:
# List version of the reverse mapping. Iterating a dict yields its keys in
# insertion order, and the from-scratch word2idx assigned ids 0, 1, 2, ... in
# that same order, so position i holds the token whose id is i.
idx2word_list = list(word2idx)

In [7]:
idx2word

{0: 'ad',
 1: 'sales',
 2: 'boost',
 3: 'time',
 4: 'warner',
 5: 'profit',
 6: 'quarterly',
 7: 'profits',
 8: 'at',
 9: 'us',
 10: 'media',
 11: 'giant',
 12: 'timewarner',
 13: 'jumped',
 14: '76',
 15: '%',
 16: 'to',
 17: '$',
 18: '1.13bn',
 19: '(',
 20: '£600m',
 21: ')',
 22: 'for',
 23: 'the',
 24: 'three',
 25: 'months',
 26: 'december',
 27: ',',
 28: 'from',
 29: '639m',
 30: 'year-earlier',
 31: '.',
 32: 'firm',
 33: 'which',
 34: 'is',
 35: 'now',
 36: 'one',
 37: 'of',
 38: 'biggest',
 39: 'investors',
 40: 'in',
 41: 'google',
 42: 'benefited',
 43: 'high-speed',
 44: 'internet',
 45: 'connections',
 46: 'and',
 47: 'higher',
 48: 'advert',
 49: 'said',
 50: 'fourth',
 51: 'quarter',
 52: 'rose',
 53: '2',
 54: '11.1bn',
 55: '10.9bn',
 56: 'its',
 57: 'were',
 58: 'buoyed',
 59: 'by',
 60: 'one-off',
 61: 'gains',
 62: 'offset',
 63: 'a',
 64: 'dip',
 65: 'bros',
 66: 'less',
 67: 'users',
 68: 'aol',
 69: 'on',
 70: 'friday',
 71: 'that',
 72: 'it',
 73: 'owns',
 74

In [14]:
idx2word_list

['ad',
 'sales',
 'boost',
 'time',
 'warner',
 'profit',
 'quarterly',
 'profits',
 'at',
 'us',
 'media',
 'giant',
 'timewarner',
 'jumped',
 '76',
 '%',
 'to',
 '$',
 '1.13bn',
 '(',
 '£600m',
 ')',
 'for',
 'the',
 'three',
 'months',
 'december',
 ',',
 'from',
 '639m',
 'year-earlier',
 '.',
 'firm',
 'which',
 'is',
 'now',
 'one',
 'of',
 'biggest',
 'investors',
 'in',
 'google',
 'benefited',
 'high-speed',
 'internet',
 'connections',
 'and',
 'higher',
 'advert',
 'said',
 'fourth',
 'quarter',
 'rose',
 '2',
 '11.1bn',
 '10.9bn',
 'its',
 'were',
 'buoyed',
 'by',
 'one-off',
 'gains',
 'offset',
 'a',
 'dip',
 'bros',
 'less',
 'users',
 'aol',
 'on',
 'friday',
 'that',
 'it',
 'owns',
 '8',
 'search-engine',
 'but',
 'own',
 'business',
 'had',
 'has',
 'mixed',
 'fortunes',
 'lost',
 '464,000',
 'subscribers',
 'lower',
 'than',
 'preceding',
 'quarters',
 'however',
 'company',
 "'s",
 'underlying',
 'before',
 'exceptional',
 'items',
 'back',
 'stronger',
 'adverti

In [21]:
# number of documents
N = len(df['text'])
N

2225

In [22]:
len(tokenized_docs)

2225

In [23]:
# number of words
V = len(word2idx)
V

34613

In [24]:
# instantiate term-frequency matrix
# note: could have also used count vectorizer
# tf[i, j] is filled in later with the number of times token j occurs in document i.
# np.zeros defaults to float64, so this is a dense (N, V) float array; with the
# N = 2225 and V = 34613 shown above that is roughly 0.6 GB of memory.
tf = np.zeros((N, V))

In [16]:
tokenized_docs

[[0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  16,
  26,
  27,
  28,
  17,
  29,
  30,
  31,
  23,
  32,
  27,
  33,
  34,
  35,
  36,
  37,
  23,
  38,
  39,
  40,
  41,
  27,
  42,
  28,
  1,
  37,
  43,
  44,
  45,
  46,
  47,
  48,
  1,
  31,
  12,
  49,
  50,
  51,
  1,
  52,
  53,
  15,
  16,
  17,
  54,
  28,
  17,
  55,
  31,
  56,
  7,
  57,
  58,
  59,
  60,
  61,
  33,
  62,
  63,
  5,
  64,
  8,
  4,
  65,
  27,
  46,
  66,
  67,
  22,
  68,
  31,
  3,
  4,
  49,
  69,
  70,
  71,
  72,
  35,
  73,
  74,
  15,
  37,
  75,
  41,
  31,
  76,
  56,
  77,
  44,
  78,
  27,
  68,
  27,
  79,
  80,
  81,
  82,
  31,
  72,
  83,
  84,
  85,
  40,
  23,
  50,
  51,
  7,
  57,
  86,
  87,
  40,
  23,
  88,
  24,
  89,
  31,
  90,
  27,
  23,
  91,
  49,
  68,
  92,
  93,
  5,
  94,
  95,
  96,
  52,
  74,
  15,
  69,
  23,
  97,
  37,
  98,
  44,
  99,
  100,
  31,
  72,
  101,
  16,

In [17]:
word2idx

{'ad': 0,
 'sales': 1,
 'boost': 2,
 'time': 3,
 'warner': 4,
 'profit': 5,
 'quarterly': 6,
 'profits': 7,
 'at': 8,
 'us': 9,
 'media': 10,
 'giant': 11,
 'timewarner': 12,
 'jumped': 13,
 '76': 14,
 '%': 15,
 'to': 16,
 '$': 17,
 '1.13bn': 18,
 '(': 19,
 '£600m': 20,
 ')': 21,
 'for': 22,
 'the': 23,
 'three': 24,
 'months': 25,
 'december': 26,
 ',': 27,
 'from': 28,
 '639m': 29,
 'year-earlier': 30,
 '.': 31,
 'firm': 32,
 'which': 33,
 'is': 34,
 'now': 35,
 'one': 36,
 'of': 37,
 'biggest': 38,
 'investors': 39,
 'in': 40,
 'google': 41,
 'benefited': 42,
 'high-speed': 43,
 'internet': 44,
 'connections': 45,
 'and': 46,
 'higher': 47,
 'advert': 48,
 'said': 49,
 'fourth': 50,
 'quarter': 51,
 'rose': 52,
 '2': 53,
 '11.1bn': 54,
 '10.9bn': 55,
 'its': 56,
 'were': 57,
 'buoyed': 58,
 'by': 59,
 'one-off': 60,
 'gains': 61,
 'offset': 62,
 'a': 63,
 'dip': 64,
 'bros': 65,
 'less': 66,
 'users': 67,
 'aol': 68,
 'on': 69,
 'friday': 70,
 'that': 71,
 'it': 72,
 'owns': 73,
 '8

In [25]:
# populate term-frequency counts: tf[i, j] = occurrences of token j in document i
for i, doc_as_int in enumerate(tokenized_docs):
    # Start each row from zero so that re-running this cell (without
    # re-creating `tf`) recomputes the counts instead of adding a second
    # copy on top of the previous ones.
    tf[i] = 0
    for j in doc_as_int:
        tf[i, j] += 1

In [26]:
type(tf)

numpy.ndarray

In [27]:
tf.shape

(2225, 34613)

In [28]:
# compute IDF
# document frequency = number of documents in which each token appears
# at least once (shape = (V, ))
document_freq = (tf > 0).sum(axis=0)
idf = np.log(N / document_freq)

In [20]:
document_freq

array([ 12, 204, 127, ...,   1,   1,   1])

In [21]:
idf

array([5.22260554, 2.3893922 , 2.86332511, ..., 7.70751219, 7.70751219,
       7.70751219])

In [29]:
# compute TF-IDF
# idf has one entry per column of tf, so numpy broadcasts it across every row
tf_idf = np.multiply(tf, idf)

In [30]:
np.random.seed(123)

In [50]:
# pick a random document, show the top 5 terms (in terms of tf_idf score)

# `idx2word` is rebound further down the notebook from CountVectorizer's
# vocabulary, which uses different ids. If that version is still in the kernel,
# the ids below would be decoded into unrelated words, so verify that the
# mapping matches the matrix before using it (checked before drawing the
# random index so a failure does not consume random state).
assert len(idx2word) == tf_idf.shape[1], (
    "idx2word does not match tf_idf: re-run the 'From Scratch' cells "
    "that build word2idx / idx2word"
)

i = np.random.choice(N)
row = df.iloc[i]
print("Label:", row['labels'])
print("Text:", row['text'].split("\n", 1)[0])  # first line of the article only
print("Top 5 terms")

scores = tf_idf[i]
indices = (-scores).argsort() # argsort is ascending, so negate to sort in desc order
for j in indices[:5]:
    print(idx2word[j], scores[j])

Label: business
Text: 'Strong dollar' call halts slide
Top 5 terms
155m 34.57480403445754
canal 19.739693889442236
captain 18.96365350044135
159 17.983191652135147
16 16.074530961945616


# Exercise: use CountVectorizer to form the counts instead

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
# Let scikit-learn tokenize the documents and count the terms.
# fit_transform learns the vocabulary and returns the document-term counts
# as a sparse matrix (one row per document, one column per term).
vectorizer = CountVectorizer()
docs = df['text']
tf2 = vectorizer.fit_transform(docs)

In [36]:
tf2

<2225x29421 sparse matrix of type '<class 'numpy.int64'>'
	with 449254 stored elements in Compressed Sparse Row format>

In [37]:
# Convert the sparse count matrix to a dense ndarray for the numpy code below.
# Guarded so the cell can be re-run: once tf2 is already an ndarray it has no
# .toarray() method and an unguarded call would raise AttributeError.
if hasattr(tf2, "toarray"):
    tf2 = tf2.toarray()

In [8]:
vectorizer.get_feature_names_out()

array(['00', '000', '0001', ..., 'zutons', 'zvonareva', 'zvyagintsev'],
      dtype=object)

In [32]:
# The fitted vectorizer's vocabulary: term -> column index in tf2.
# NOTE: this rebinds `word2idx`, the name the "From Scratch" section uses for
# its own vocabulary. The two mappings assign different ids, so any scratch
# cell executed after this one will see this mapping instead of its own.
word2idx = vectorizer.vocabulary_
word2idx

{'ad': 1750,
 'sales': 23062,
 'boost': 4349,
 'time': 26665,
 'warner': 28502,
 'profit': 20825,
 'quarterly': 21204,
 'profits': 20832,
 'at': 2994,
 'us': 27901,
 'media': 17064,
 'giant': 11826,
 'timewarner': 26677,
 'jumped': 14933,
 '76': 1249,
 'to': 26730,
 '13bn': 184,
 '600m': 1105,
 'for': 11102,
 'the': 26462,
 'three': 26566,
 'months': 17692,
 'december': 7736,
 'from': 11377,
 '639m': 1136,
 'year': 29256,
 'earlier': 9183,
 'firm': 10842,
 'which': 28749,
 'is': 14510,
 'now': 18557,
 'one': 18837,
 'of': 18726,
 'biggest': 3963,
 'investors': 14406,
 'in': 13801,
 'google': 12066,
 'benefited': 3816,
 'high': 13002,
 'speed': 24836,
 'internet': 14299,
 'connections': 6653,
 'and': 2429,
 'higher': 13005,
 'advert': 1885,
 'said': 23041,
 'fourth': 11224,
 'quarter': 21203,
 'rose': 22793,
 '11': 106,
 '1bn': 461,
 '10': 68,
 '9bn': 1466,
 'its': 14571,
 'were': 28694,
 'buoyed': 4868,
 'by': 4980,
 'off': 18730,
 'gains': 11534,
 'offset': 18758,
 'dip': 8384,
 'bros

In [33]:
len(word2idx)

29421

In [34]:
# reverse mapping: column index -> term
idx2word = {col_idx: term for term, col_idx in word2idx.items()}
idx2word

{1750: 'ad',
 23062: 'sales',
 4349: 'boost',
 26665: 'time',
 28502: 'warner',
 20825: 'profit',
 21204: 'quarterly',
 20832: 'profits',
 2994: 'at',
 27901: 'us',
 17064: 'media',
 11826: 'giant',
 26677: 'timewarner',
 14933: 'jumped',
 1249: '76',
 26730: 'to',
 184: '13bn',
 1105: '600m',
 11102: 'for',
 26462: 'the',
 26566: 'three',
 17692: 'months',
 7736: 'december',
 11377: 'from',
 1136: '639m',
 29256: 'year',
 9183: 'earlier',
 10842: 'firm',
 28749: 'which',
 14510: 'is',
 18557: 'now',
 18837: 'one',
 18726: 'of',
 3963: 'biggest',
 14406: 'investors',
 13801: 'in',
 12066: 'google',
 3816: 'benefited',
 13002: 'high',
 24836: 'speed',
 14299: 'internet',
 6653: 'connections',
 2429: 'and',
 13005: 'higher',
 1885: 'advert',
 23041: 'said',
 11224: 'fourth',
 21203: 'quarter',
 22793: 'rose',
 106: '11',
 461: '1bn',
 68: '10',
 1466: '9bn',
 14571: 'its',
 28694: 'were',
 4868: 'buoyed',
 4980: 'by',
 18730: 'off',
 11534: 'gains',
 18758: 'offset',
 8384: 'dip',
 4713:

In [39]:
# compute IDF
# doc_freqs[j] = number of documents whose count for term j is positive
doc_freqs = (tf2 > 0).sum(axis=0)
idf2 = np.log(N / doc_freqs)

In [40]:
# compute TF-IDF
tf_idf_2 = tf2 * idf2

In [42]:
N2 = tf2.shape[0]
N2

2225

In [43]:
# TOP 5 scores
j = np.random.choice(N2)
docj = df.iloc[j]
docj

text      Clarke faces ID cards rebellion\n\nCharles Cla...
labels                                             politics
Name: 1122, dtype: object

In [47]:
# Pick a random document and print its 5 highest-scoring TF-IDF terms
# (CountVectorizer version).
j = np.random.choice(N2)
docj = df.iloc[j]

print("Label:", docj['labels'])
print("Text:", docj['text'].split("\n", 1)[0])  # first line of the article only
print("Top 5 terms")

# Decode column indices with the vectorizer's own feature names rather than
# the shared `idx2word` name, which other sections of the notebook rebind to
# a different vocabulary.
feature_names = vectorizer.get_feature_names_out()

scores2 = tf_idf_2[j]
indices2 = (-scores2).argsort()  # argsort is ascending, so negate for descending
# Use a separate loop variable so the document index `j` is not overwritten.
for term_idx in indices2[:5]:
    print(feature_names[term_idx], scores2[term_idx])

Label: politics
Text: Blair 'said he would stand down'
Top 5 terms
book 22.453917669890526
brown 20.098612013542038
blair 17.601887405970913
peston 15.66781663443702
mutual 14.99838598049439


# Exercise (hard): use Scipy's csr_matrix instead

You cannot use `X[i, j] += 1`

## Tokenize docs and map tokens into integers (`word2idx`)

In [3]:
# Build the vocabulary (token -> integer id) and, in the same pass, encode
# every document as the list of ids of its lower-cased tokens.
idx = 0              # next unused id
word2idx = {}
tokenized_docs = []
for doc in df['text']:
    doc_as_int = []
    for token in word_tokenize(doc.lower()):
        # setdefault returns the existing id, or stores and returns `idx`
        # when the token has not been seen before
        token_id = word2idx.setdefault(token, idx)
        if token_id == idx:
            # a new token just consumed `idx`; move on to the next free id
            idx += 1
        doc_as_int.append(token_id)
    tokenized_docs.append(doc_as_int)

In [4]:
len(tokenized_docs[0])

490

In [5]:
len(tokenized_docs[1])

433

In [8]:
# Show the first six (token, id) pairs of the vocabulary.
i = 0  # stays 0 if the vocabulary is empty
for i, (key, value) in enumerate(word2idx.items(), start=1):
    print(f"{key}: {value}")
    if i > 5:
        break

ad: 0
sales: 1
boost: 2
time: 3
warner: 4
profit: 5


## Reverse mapping: `idx2word`

In [4]:
# integer id -> token, the inverse of word2idx
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [9]:
# Show the first six (id, token) pairs of the reverse mapping.
i = 0  # stays 0 if the mapping is empty
for i, (key, value) in enumerate(idx2word.items(), start=1):
    print(f"{key}: {value}")
    if i > 5:
        break

0: ad
1: sales
2: boost
3: time
4: warner
5: profit


## Sparse matrix of term frequencies (`tf`)

In [5]:
# Number of documents N:
N = len(tokenized_docs)
print(f"N: {N}")
# Number of words/tokens V
V = len(word2idx)
print(f"V: {V}")

N: 2225
V: 34613


In [6]:
from scipy.sparse import coo_matrix, csr_matrix

# Sparse matrices do not support efficient element-wise `X[i, j] += 1`, so the
# counts are first collected as (row, column, value) triplets, one triplet per
# distinct token of each document.
rows = []
columns = []
data = []

# populate term-frequency counts
for i, doc_as_int in enumerate(tokenized_docs):
    # token id -> number of occurrences in document i
    token_freqs = {}
    for j in doc_as_int:
        token_freqs[j] = token_freqs.get(j, 0) + 1
    columns.extend(token_freqs.keys())
    data.extend(token_freqs.values())
    rows.extend([i]*len(token_freqs))

# Explicit integer dtypes keep the arrays valid even if no triplet was collected
# (np.array([]) would otherwise default to float64).
rows = np.array(rows, dtype=np.int64)
cols = np.array(columns, dtype=np.int64)
data = np.array(data, dtype=np.int64)

# Construct directly in COO format and then convert to CSR (Compressed Sparse Row matrix).
# The shape is passed explicitly: without it scipy infers the shape from the
# largest row/column index present, so a trailing document with no tokens would
# silently produce a matrix with fewer than N rows.
sparse_coo = coo_matrix((data, (rows, cols)), shape=(N, V))
tf_sparse = sparse_coo.tocsr()
tf_sparse

<2225x34613 sparse matrix of type '<class 'numpy.int64'>'
	with 469214 stored elements in Compressed Sparse Row format>

In [28]:
sparse_coo.shape

(2225, 34613)

In [30]:
sparse_coo

<2225x34613 sparse matrix of type '<class 'numpy.int64'>'
	with 469214 stored elements in COOrdinate format>

In [22]:
d = {'first': 1, 'second': 2}
d.keys()

dict_keys(['first', 'second'])

## Compute tf-idf-sparse

In [7]:
# compute IDF
# getnnz(axis=0) returns the number of stored entries in each column. tf_sparse
# stores one entry per (document, token) pair that occurs, so this is the
# document frequency of every token.
# NOTE(review): a column with no stored entries would give a division by zero
# below; presumably every token id occurs in at least one document - confirm.
document_freq = tf_sparse.getnnz(axis=0) # number of non-zero entries per column. document frequency (shape = (V, ))
idf = np.log(N / document_freq)

In [9]:
len(document_freq)

34613

In [32]:
document_freq

array([ 12, 204, 127, ...,   1,   1,   1])

In [33]:
idf

array([5.22260554, 2.3893922 , 2.86332511, ..., 7.70751219, 7.70751219,
       7.70751219])

In [10]:
# compute tf-idf
# .multiply is element-wise; idf has one entry per column, so each column of
# tf_sparse is scaled by that token's idf.

tf_idf_sparse = tf_sparse.multiply(idf).tocsr() # without .tocsr() the result would be a sparse matrix in COOrdinate format
tf_idf_sparse

<2225x34613 sparse matrix of type '<class 'numpy.float64'>'
	with 469214 stored elements in Compressed Sparse Row format>

In [35]:
tf_sparse

<2225x34613 sparse matrix of type '<class 'numpy.int64'>'
	with 469214 stored elements in Compressed Sparse Row format>

In [17]:
# np.random.seed(123)

# pick a random document, show the top 5 terms (in terms of tf_idf score)
i = np.random.choice(N)
row = df.iloc[i]
print("Label:", row['labels'])
print("Text:", row['text'].split("\n", 1)[0])  # first line of the article only
print("Top 5 terms")

# In CSR format the entries of row i occupy positions indptr[i]:indptr[i+1]
# of the flat .data (values) and .indices (column ids) arrays.
start = tf_idf_sparse.indptr[i]
end   = tf_idf_sparse.indptr[i+1]

# Extract the data and column indices for row i
row_data    = tf_idf_sparse.data[start:end]
row_indices = tf_idf_sparse.indices[start:end] # these are in fact column indices (j's) in tf_idf_sparse matrix

# Number of nonzeros in row i
nnz_row = end - start

if nnz_row == 0:
    print(f"Row {i} has no nonzero entries.")
else:
    # Sort by the nonzero values in descending order
    # np.argsort sorts ascending, so we negate row_data to sort descending
    sort_desc = np.argsort(-row_data)

    # Top 5 (or fewer if row has < 5 nonzeros)
    k = min(5, nnz_row)
    top_k = sort_desc[:k]

    # Extract the values and column indices of the top k
    top_values = row_data[top_k]
    top_cols   = row_indices[top_k]

    # Print inside the else-branch: top_cols / top_values only exist when the
    # row has entries. Outside it, an empty row would raise NameError (or
    # silently print the terms left over from a previous run of the cell).
    for j, v in zip(top_cols, top_values):
        print(idx2word[j], v)

Label: tech
Text: BBC leads interactive Bafta wins
Top 5 terms
interactive 38.17598662620552
bafta 24.08570218352088
awards 20.043275756992244
best 14.593518448896493
film 11.134366356291746


In [15]:
i

1110

In [16]:
df['text'][i]

'New rules tackle \'sham weddings\'\n\nNew rules on marriage for foreign nationals living in the UK are coming into force.\n\nFrom Tuesday, most non-EU citizens will need Home Office approval to marry. The Home Office says the new rules are aimed at reducing the number of sham marriages, of which there are estimated to be up to 15,000 a year. But immigrants\' group the Joint Council for the Welfare of Immigrants (JCWI) says the rules breach human rights law and it may mount a legal challenge. When the changes were unveiled last year, immigration minister Des Browne said: "Our aim is to avoid unnecessary disruption of genuine marriages, while providing firm controls to prevent abuse." Under the previous regulations anybody wishing to get married in the UK only had to produce evidence they had been resident in the country for a week and give 15 days notice of the wedding at the local register office.\n\nBut from Tuesday all non-EU nationals, apart from citizens of Switzerland, Liechtenst

### some exploration

In [37]:
import numpy as np
from scipy.sparse import csr_matrix

# Toy 3x3 CSR matrix built from (value, (row, col)) triplets
row  = np.array([0, 0, 1, 2, 2, 2])
col  = np.array([0, 2, 2, 0, 1, 2])
data = np.array([10, 30, 20, 40, 5, 7])
M = csr_matrix((data, (row, col)), shape=(3, 3))

i = 2  # row whose (up to) 5 largest stored entries we want

# Row i's entries sit at positions indptr[i]:indptr[i+1] of M.data / M.indices
start, end = M.indptr[i], M.indptr[i+1]
row_data, row_indices = M.data[start:end], M.indices[start:end]

# How many entries row i stores
nnz_row = end - start

if nnz_row > 0:
    # Never ask for more entries than the row actually has
    k = min(5, nnz_row)

    # argsort is ascending; negating the values yields descending order
    top_k = np.argsort(-row_data)[:k]

    # Values of the top k entries and the columns they live in
    top_values, top_cols = row_data[top_k], row_indices[top_k]

    print(f"Row {i} nonzero values:", row_data)
    print(f"Row {i} column indices:", row_indices)
    print(f"Top {k} values in row {i}:", top_values)
    print(f"Corresponding column indices:", top_cols)
else:
    print(f"Row {i} has no nonzero entries.")


Row 2 nonzero values: [40  5  7]
Row 2 column indices: [0 1 2]
Top 3 values in row 2: [40  7  5]
Corresponding column indices: [0 2 1]


In [48]:
row_data

array([40,  5,  7])

In [49]:
row_indices

array([0, 1, 2], dtype=int32)

In [39]:
M.toarray()

array([[10,  0, 30],
       [ 0,  0, 20],
       [40,  5,  7]])

In [40]:
M

<3x3 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [41]:
M.data

array([10, 30, 20, 40,  5,  7])

In [42]:
M.indptr[2]

3

In [43]:
M.indptr[3]

6

In [44]:
M.indptr[0]

0

In [45]:
M.indptr[1]

2

In [46]:
M.indptr[2]

3

In [47]:
M.indptr

array([0, 2, 3, 6], dtype=int32)

In [52]:
x = M[2, :].toarray()
x

array([[40,  5,  7]])

In [53]:
(-x).argsort()

array([[0, 2, 1]])