In [151]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, make_scorer, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

In [152]:
import pandas as pd

# Read the training set and flip it so each record becomes a row
# (the transposed frame is what the following cells index by column name).
json_file_path = 'train_for_student.json'
df = pd.read_json(json_file_path).transpose()


In [153]:
# Merge title and abstract into the single text field fed to the vectorizer.
df["CONTEXT"] = df['Title'] + ". " + df["Abstract"]

# Preview goes last: only a cell's final expression is rendered, so the
# original leading `df.head()` displayed nothing. Placed here it also shows
# the freshly added CONTEXT column.
df.head()

In [154]:
# Drop the raw text columns by name instead of by position. The positional,
# in-place form removed whatever happened to sit in the first two slots, so
# re-running the cell would silently delete further columns. errors='ignore'
# makes a re-run a harmless no-op.
df = df.drop(columns=['Title', 'Abstract'], errors='ignore')


In [155]:
# Keep only the model input text and the label list, in that order.
df = df.loc[:, ['CONTEXT', 'Classes']]

In [156]:
# Display the reduced frame (CONTEXT text and Classes label list per record).
df

Unnamed: 0,CONTEXT,Classes
1,Activated carbon derived from bacterial cellul...,"[CHE, MATENG]"
2,The algorithm of static hand gesture recogniti...,[CPE]
3,Alternative Redundant Residue Number System Co...,[EE]
4,Comparative study of wax inhibitor performance...,"[PE, ME, CHE]"
5,Undrained lower bound solutions for end bearin...,"[CE, MATSCI]"
...,...,...
450,A portable USB-controlled potentiostat for pap...,"[CPE, CHE]"
451,Literature reviews on applying artificial inte...,"[CPE, EDU]"
452,A multi-parameterized water quality prediction...,"[ENV, EE, CHE]"
453,Semantic Segmentation on Medium-Resolution Sat...,"[EE, CPE, OPTIC, EDU]"


In [157]:
# Raw targets: one list of class codes per record.
y = df.loc[:, 'Classes']

In [158]:
# Display the raw label lists before they are binarized.
y

1              [CHE, MATENG]
2                      [CPE]
3                       [EE]
4              [PE, ME, CHE]
5               [CE, MATSCI]
               ...          
450               [CPE, CHE]
451               [CPE, EDU]
452           [ENV, EE, CHE]
453    [EE, CPE, OPTIC, EDU]
454     [METAL, EDU, MATSCI]
Name: Classes, Length: 454, dtype: object

In [159]:
# Turn the per-record label lists into a 0/1 indicator matrix with one
# column per class; the fitted binarizer is kept for its `classes_` order.
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(df.loc[:, 'Classes'])
y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [160]:
# Indicator vector of the first record (one slot per entry of multilabel.classes_).
y[0]

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0])

In [161]:
# Class codes learned by the binarizer, in the column order of `y`.
multilabel.classes_

array(['AGRI', 'BME', 'CE', 'CHE', 'CPE', 'EDU', 'EE', 'ENV', 'IE',
       'MATENG', 'MATH', 'MATSCI', 'ME', 'METAL', 'NANO', 'OPTIC', 'PE',
       'SAFETY'], dtype=object)

In [162]:
# Number of distinct class codes found in the training labels.
multilabel.classes_.shape[0]

18

In [163]:
# Column order used when the prediction frame is rearranged for the CSV export.
CLASSES_ARRANGE = [
    'CE', 'ENV', 'BME', 'PE', 'METAL', 'ME',
    'EE', 'CPE', 'OPTIC', 'NANO', 'CHE', 'MATENG',
    'AGRI', 'EDU', 'IE', 'SAFETY', 'MATH', 'MATSCI',
]

In [164]:
# Wrap the indicator matrix in a frame with one named column per class.
# Reusing df's index keeps every label row addressable by the same record id
# as its CONTEXT row; the default RangeIndex would only line up by position.
new_y = pd.DataFrame(y, columns=multilabel.classes_, index=df.index)

In [165]:
# Hold out 20% of the records for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['CONTEXT'],
    new_y,
    test_size=0.2,
    random_state=0,
)

In [166]:
# Scorer handed to the grid search: F1 averaged across labels with
# average='weighted'.
f1_scorer = make_scorer(score_func=f1_score, average='weighted')

In [167]:
# Text -> TF-IDF features -> one linear SVM per class (one-vs-rest).
# random_state pins LinearSVC's internal shuffling so that repeated runs give
# the same model; 0 matches the seed already used for the train/test split.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='word', max_features=1000)),
    ('clf', OneVsRestClassifier(LinearSVC(random_state=0), n_jobs=1)),
])

In [168]:
# Search space for the grid search, addressed as <pipeline step>__<parameter>.
parameters = dict(
    tfidf__max_df=(0.75, 0.85, 0.95),
    tfidf__min_df=(0.01, 0.05, 0.1),
    # Increased range for C
    clf__estimator__C=(0.01, 0.1, 1, 10, 100, 1000, 10000),
)

In [169]:
# Jointly tune the vectorizer and the classifier with a 2-fold grid search
# scored by weighted F1.
grid_search_tune = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=2,
    scoring=f1_scorer,
    verbose=2,
)
grid_search_tune.fit(X_train, y_train)

# Report the winning pipeline steps (vectorizer and classifier settings).
print("Best parameters set:", grid_search_tune.best_estimator_.steps, sep="\n")

Fitting 2 folds for each of 63 candidates, totalling 126 fits
[CV] END clf__estimator__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.01; total time=   0.1s




[CV] END clf__estimator__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.01; total time=   0.1s
[CV] END clf__estimator__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.05; total time=   0.0s




[CV] END clf__estimator__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=0.01, tfidf__max_df=0.75, tfidf__min_df=0.1; total time=   0.1s
[CV] END clf__estimator__C=0.01, tfidf__max_df=0.85, tfidf__min_df=0.01; total time=   0.0s




[CV] END clf__estimator__C=0.01, tfidf__max_df=0.85, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=0.01, tfidf__max_df=0.85, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=0.01, tfidf__max_df=0.85, tfidf__min_df=0.05; total time=   0.0s




[CV] END clf__estimator__C=0.01, tfidf__max_df=0.85, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=0.01, tfidf__max_df=0.85, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=0.01, tfidf__max_df=0.95, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=0.01, tfidf__max_df=0.95, tfidf__min_df=0.01; total time=   0.0s




[CV] END clf__estimator__C=0.01, tfidf__max_df=0.95, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=0.01, tfidf__max_df=0.95, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=0.01, tfidf__max_df=0.95, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=0.01, tfidf__max_df=0.95, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.01; total time=   0.0s




[CV] END clf__estimator__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.05; total time=   0.0s




[CV] END clf__estimator__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=0.1, tfidf__max_df=0.75, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=0.1, tfidf__max_df=0.85, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=0.1, tfidf__max_df=0.85, tfidf__min_df=0.01; total time=   0.0s




[CV] END clf__estimator__C=0.1, tfidf__max_df=0.85, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=0.1, tfidf__max_df=0.85, tfidf__min_df=0.05; total time=   0.0s




[CV] END clf__estimator__C=0.1, tfidf__max_df=0.85, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=0.1, tfidf__max_df=0.85, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=0.1, tfidf__max_df=0.95, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=0.1, tfidf__max_df=0.95, tfidf__min_df=0.01; total time=   0.0s




[CV] END clf__estimator__C=0.1, tfidf__max_df=0.95, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=0.1, tfidf__max_df=0.95, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=0.1, tfidf__max_df=0.95, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=0.1, tfidf__max_df=0.95, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=1, tfidf__max_df=0.75, tfidf__min_df=0.01; total time=   0.0s




[CV] END clf__estimator__C=1, tfidf__max_df=0.75, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=1, tfidf__max_df=0.75, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=1, tfidf__max_df=0.75, tfidf__min_df=0.05; total time=   0.0s




[CV] END clf__estimator__C=1, tfidf__max_df=0.75, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=1, tfidf__max_df=0.75, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=1, tfidf__max_df=0.85, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=1, tfidf__max_df=0.85, tfidf__min_df=0.01; total time=   0.0s




[CV] END clf__estimator__C=1, tfidf__max_df=0.85, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=1, tfidf__max_df=0.85, tfidf__min_df=0.05; total time=   0.0s




[CV] END clf__estimator__C=1, tfidf__max_df=0.85, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=1, tfidf__max_df=0.85, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=1, tfidf__max_df=0.95, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=1, tfidf__max_df=0.95, tfidf__min_df=0.01; total time=   0.0s




[CV] END clf__estimator__C=1, tfidf__max_df=0.95, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=1, tfidf__max_df=0.95, tfidf__min_df=0.05; total time=   0.0s




[CV] END clf__estimator__C=1, tfidf__max_df=0.95, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=1, tfidf__max_df=0.95, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=10, tfidf__max_df=0.75, tfidf__min_df=0.01; total time=   0.1s
[CV] END clf__estimator__C=10, tfidf__max_df=0.75, tfidf__min_df=0.01; total time=   0.0s




[CV] END clf__estimator__C=10, tfidf__max_df=0.75, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=10, tfidf__max_df=0.75, tfidf__min_df=0.05; total time=   0.0s




[CV] END clf__estimator__C=10, tfidf__max_df=0.75, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=10, tfidf__max_df=0.75, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=10, tfidf__max_df=0.85, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=10, tfidf__max_df=0.85, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=10, tfidf__max_df=0.85, tfidf__min_df=0.05; total time=   0.0s




[CV] END clf__estimator__C=10, tfidf__max_df=0.85, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=10, tfidf__max_df=0.85, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=10, tfidf__max_df=0.85, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=10, tfidf__max_df=0.95, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=10, tfidf__max_df=0.95, tfidf__min_df=0.01; total time=   0.0s




[CV] END clf__estimator__C=10, tfidf__max_df=0.95, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=10, tfidf__max_df=0.95, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=10, tfidf__max_df=0.95, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=10, tfidf__max_df=0.95, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=100, tfidf__max_df=0.75, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=100, tfidf__max_df=0.75, tfidf__min_df=0.01; total time=   0.0s




[CV] END clf__estimator__C=100, tfidf__max_df=0.75, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=100, tfidf__max_df=0.75, tfidf__min_df=0.05; total time=   0.0s




[CV] END clf__estimator__C=100, tfidf__max_df=0.75, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=100, tfidf__max_df=0.75, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=100, tfidf__max_df=0.85, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=100, tfidf__max_df=0.85, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=100, tfidf__max_df=0.85, tfidf__min_df=0.05; total time=   0.0s




[CV] END clf__estimator__C=100, tfidf__max_df=0.85, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=100, tfidf__max_df=0.85, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=100, tfidf__max_df=0.85, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=100, tfidf__max_df=0.95, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=100, tfidf__max_df=0.95, tfidf__min_df=0.01; total time=   0.0s




[CV] END clf__estimator__C=100, tfidf__max_df=0.95, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=100, tfidf__max_df=0.95, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=100, tfidf__max_df=0.95, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=100, tfidf__max_df=0.95, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=1000, tfidf__max_df=0.75, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=1000, tfidf__max_df=0.75, tfidf__min_df=0.01; total time=   0.0s




[CV] END clf__estimator__C=1000, tfidf__max_df=0.75, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=1000, tfidf__max_df=0.75, tfidf__min_df=0.05; total time=   0.0s




[CV] END clf__estimator__C=1000, tfidf__max_df=0.75, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=1000, tfidf__max_df=0.75, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=1000, tfidf__max_df=0.85, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=1000, tfidf__max_df=0.85, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=1000, tfidf__max_df=0.85, tfidf__min_df=0.05; total time=   0.0s




[CV] END clf__estimator__C=1000, tfidf__max_df=0.85, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=1000, tfidf__max_df=0.85, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=1000, tfidf__max_df=0.85, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=1000, tfidf__max_df=0.95, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=1000, tfidf__max_df=0.95, tfidf__min_df=0.01; total time=   0.0s




[CV] END clf__estimator__C=1000, tfidf__max_df=0.95, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=1000, tfidf__max_df=0.95, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=1000, tfidf__max_df=0.95, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=1000, tfidf__max_df=0.95, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=10000, tfidf__max_df=0.75, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=10000, tfidf__max_df=0.75, tfidf__min_df=0.01; total time=   0.0s




[CV] END clf__estimator__C=10000, tfidf__max_df=0.75, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=10000, tfidf__max_df=0.75, tfidf__min_df=0.05; total time=   0.0s




[CV] END clf__estimator__C=10000, tfidf__max_df=0.75, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=10000, tfidf__max_df=0.75, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=10000, tfidf__max_df=0.85, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=10000, tfidf__max_df=0.85, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=10000, tfidf__max_df=0.85, tfidf__min_df=0.05; total time=   0.0s




[CV] END clf__estimator__C=10000, tfidf__max_df=0.85, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=10000, tfidf__max_df=0.85, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=10000, tfidf__max_df=0.85, tfidf__min_df=0.1; total time=   0.0s
[CV] END clf__estimator__C=10000, tfidf__max_df=0.95, tfidf__min_df=0.01; total time=   0.0s
[CV] END clf__estimator__C=10000, tfidf__max_df=0.95, tfidf__min_df=0.01; total time=   0.0s




[CV] END clf__estimator__C=10000, tfidf__max_df=0.95, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=10000, tfidf__max_df=0.95, tfidf__min_df=0.05; total time=   0.0s
[CV] END clf__estimator__C=10000, tfidf__max_df=0.95, tfidf__min_df=0.1; total time=   0.0s




[CV] END clf__estimator__C=10000, tfidf__max_df=0.95, tfidf__min_df=0.1; total time=   0.0s
Best parameters set:
[('tfidf', TfidfVectorizer(max_df=0.85, max_features=1000, min_df=0.01)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=100), n_jobs=1))]




In [170]:
# Prediction & Evaluation: label the held-out split with the best pipeline
# found (and refit) by the grid search.
y_pred = grid_search_tune.best_estimator_.predict(X_test)

In [171]:
def print_score(y_true, y_pred):
    """Print weighted-average F1 and recall for a set of predictions.

    Both arguments are forwarded unchanged to ``f1_score`` and
    ``recall_score`` with ``average='weighted'``. Nothing is returned.
    """
    # Append further (template, metric) pairs here to report more metrics.
    reports = (
        ('F1 score: {:.2f}', f1_score),
        ('Recall score: {:.2f}', recall_score),
    )
    for template, metric in reports:
        print(template.format(metric(y_true, y_pred, average='weighted')))

print_score(y_test, y_pred)

F1 score: 0.54
Recall score: 0.50


In [172]:
# Load the evaluation set and prepare it exactly like the training data:
# one row per record, with title and abstract merged into CONTEXT.
json_file_path = 'test_for_student.json'
test_df = pd.read_json(json_file_path)
test_df = test_df.T
test_df["CONTEXT"] = test_df['Title'] + '. ' + test_df["Abstract"]
# Drop the raw text columns by name; the positional form depended on the
# column order of the JSON file and could silently remove the wrong columns.
test_df = test_df.drop(columns=['Title', 'Abstract'])

In [173]:
# Preview the first evaluation records (index holds the record ids).
test_df.head()

Unnamed: 0,CONTEXT
001eval,Comparative Electrical Energy Yield Performanc...
002eval,Effects of graphene nanoplatelets on bio-based...
003eval,Anti-inflammatory action of two novel peptides...
004eval,Efficient all-and-one support vector machines ...
005eval,Driver identification using histogram and neur...


In [174]:
# Predict the class indicator matrix for every evaluation record.
test_texts = test_df['CONTEXT']
test_predictions = grid_search_tune.predict(test_texts)
print(test_predictions)

[[0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [175]:
# Plain Python list of class codes, in the binarizer's column order.
classes_array = list(multilabel.classes_)

In [176]:
# Assemble the submission table: record id followed by one 0/1 column per
# class, in the binarizer's column order.
columns = ['id'] + classes_array

# Take each id from test_df's index rather than regenerating it from the row
# position: a counter-based id is only right while the evaluation ids are
# gap-free and in order, and would otherwise mislabel rows without any error.
data = [
    [record_id, *pred]
    for record_id, pred in zip(test_df.index, test_predictions)
]

new_df = pd.DataFrame(data, columns=columns)
print(new_df)


          id  AGRI  BME  CE  CHE  CPE  EDU  EE  ENV  IE  MATENG  MATH  MATSCI  \
0    001eval     0    0   0    1    0    0   1    0   0       0     0       0   
1    002eval     0    0   1    1    0    0   1    0   0       0     0       1   
2    003eval     0    0   0    1    0    0   0    0   0       0     0       0   
3    004eval     0    0   0    0    0    0   1    0   0       0     1       0   
4    005eval     0    0   0    0    1    0   0    0   0       0     0       0   
..       ...   ...  ...  ..  ...  ...  ...  ..  ...  ..     ...   ...     ...   
146  147eval     0    0   0    1    0    0   0    1   0       1     0       1   
147  148eval     0    0   0    0    1    0   1    0   0       0     1       0   
148  149eval     0    0   0    1    0    0   0    0   0       0     0       0   
149  150eval     0    0   0    1    0    0   0    0   0       0     0       1   
150  151eval     0    0   0    0    0    0   0    0   0       0     1       0   

     ME  METAL  NANO  OPTIC

In [177]:
# Put the class columns into the CLASSES_ARRANGE order, keeping 'id' first.
submission_columns = ['id', *CLASSES_ARRANGE]
new_df = new_df.loc[:, submission_columns]

# Show the reordered frame.
print(new_df)

          id  CE  ENV  BME  PE  METAL  ME  EE  CPE  OPTIC  NANO  CHE  MATENG  \
0    001eval   0    0    0   0      0   0   1    0      0     0    1       0   
1    002eval   1    0    0   0      0   0   1    0      0     0    1       0   
2    003eval   0    0    0   0      0   0   0    0      0     0    1       0   
3    004eval   0    0    0   0      0   0   1    0      0     0    0       0   
4    005eval   0    0    0   0      0   0   0    1      0     0    0       0   
..       ...  ..  ...  ...  ..    ...  ..  ..  ...    ...   ...  ...     ...   
146  147eval   0    1    0   0      1   0   0    0      0     0    1       1   
147  148eval   0    0    0   0      0   0   1    1      0     0    0       0   
148  149eval   0    0    0   0      0   0   0    0      0     0    1       0   
149  150eval   0    0    0   0      0   1   0    0      0     0    1       0   
150  151eval   0    0    0   0      0   0   0    0      0     0    0       0   

     AGRI  EDU  IE  SAFETY  MATH  MATSC

In [178]:
# Sanity check on the predictions: number of labels assigned to each record
# (every column after the leading 'id').
row_sums = new_df.iloc[:, 1:].sum(axis=1)

# How many records received no label at all, and the largest label count.
num_rows_with_sum_zero = row_sums.eq(0).sum()

print(num_rows_with_sum_zero, max(row_sums))

3 7


In [179]:

# Write the submission table to CSV without the positional index column.
path = 'kaggle_submission2.csv'
new_df.to_csv(path, index=False)

# Report the path that was actually written; the previous message hard-coded
# a different file name than the one passed to to_csv.
print("DataFrame saved to '{}'".format(path))

DataFrame saved to 'kaggle_submission.csv'
