In [61]:
import pandas as pd
import random

def analyze_csv(file_path, max_rows=1000):
    # Load the CSV file as strings to avoid automatic type conversion
    df = pd.read_csv(file_path, dtype=str)

    # Limit to first 1000 rows
    df = df.head(max_rows)

    # Initialize lists for random values and column indices
    random_values = []
    column_indices = []

    # Get the list of column names
    columns = df.columns.tolist()

    for i, column in enumerate(columns):
        column_data = df[column].dropna().tolist()

        if not column_data:  # Skip empty columns
            random_values.append("empty")
            column_indices.append(i)
            continue

        # Check if the column data can be converted to float or int
        sample_value = column_data[0]
        if is_float(sample_value):
            column_data = [float(value) for value in column_data if is_float(value)]
        elif is_int(sample_value):
            column_data = [int(value) for value in column_data if is_int(value)]
        else:
            column_data = [str(value) for value in column_data]  # Convert to strings for non-numeric

        # Take a random sample value
        random_value = random.choice(column_data)
        random_values.append(random_value)
        column_indices.append(i)

    return random_values, column_indices

def is_float(value):
    try:
        float(value)
        return '.' in str(value)  # Ensure it's a float by checking for decimal point
    except ValueError:
        return False

def is_int(value):
    try:
        int(value)
        return '.' not in str(value)  # Ensure it's an integer by checking for absence of decimal point
    except ValueError:
        return False

# Example usage
file_path = '/content/test.csv'
random_values, column_indices = analyze_csv(file_path)

# Print the results
print("Random Values from each column:", random_values)
print("Column Indices:", column_indices)


Random Values from each column: ['USA', 515685321, 9.34, 'empty', 20241111, 'empty', 659455638, 'empty', 54596, 20241020, 'empty', 20243088, 'empty', '2022010112:20', 'empty', 'D', 'empty', 'empty', 'Y', 'empty', 'N', 'empty', 202410101530]
Column Indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]


Empty seprate

In [62]:
def separate_empty_values(values, indices):
    # Initialize lists to store matched and non-matched values and their indices
    matched_values = []
    matched_indices = []
    non_matched_values = []
    non_matched_indices = []

    # Iterate over the values and indices
    for value, index in zip(values, indices):
        if value == 'empty':
            matched_values.append(value)
            matched_indices.append(index)
        else:
            non_matched_values.append(value)
            non_matched_indices.append(index)

    return matched_values, matched_indices, non_matched_values, non_matched_indices

# Example usage
values = random_values #['Country code', 'Indi', 'yes or no ', 'yes or no ', 'empty', 20241111.0, 'empty', 659455678.0, 'empty', '2022010112:19', 'empty']
indices = column_indices #[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

matched_values, matched_indices, non_matched_values, non_matched_indices = separate_empty_values(values, indices)

# Print the results
print("Matched List:", matched_values)
print("Matched Indices:", matched_indices)
print("Non-Matched List:", non_matched_values)
print("Non-Matched Indices:", non_matched_indices)


Matched List: ['empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty']
Matched Indices: [3, 5, 7, 10, 12, 14, 16, 17, 19, 21]
Non-Matched List: ['USA', 515685321, 9.34, 20241111, 659455638, 54596, 20241020, 20243088, '2022010112:20', 'D', 'Y', 'N', 202410101530]
Non-Matched Indices: [0, 1, 2, 4, 6, 8, 9, 11, 13, 15, 18, 20, 22]


In [63]:
emp_val=matched_values
emp_ind=matched_indices
print(emp_val)
print(emp_ind)

['empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty']
[3, 5, 7, 10, 12, 14, 16, 17, 19, 21]


In [64]:
val=non_matched_values
ind=non_matched_indices
print(val)
print(ind)

['USA', 515685321, 9.34, 20241111, 659455638, 54596, 20241020, 20243088, '2022010112:20', 'D', 'Y', 'N', 202410101530]
[0, 1, 2, 4, 6, 8, 9, 11, 13, 15, 18, 20, 22]


Know Value Check using ML

In [65]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import joblib

# Load the CSV file
file_path = '/content/knowxlsx.csv'
df = pd.read_csv(file_path)

# Transpose the dataframe so that each value has an associated label
df_t = df.set_index('label').T
df_t = df_t.melt(var_name='label', value_name='value')

# Drop missing values if any
df_t.dropna(inplace=True)

# Encode the labels
label_encoder = LabelEncoder()
df_t['label_encoded'] = label_encoder.fit_transform(df_t['label'])

# Save the label encoder for later use
joblib.dump(label_encoder, 'label_encoder.pkl')

# Encode the values using OneHotEncoder
value_encoder = OneHotEncoder(sparse=False)
X = value_encoder.fit_transform(df_t[['value']])

# Save the value encoder for later use
joblib.dump(value_encoder, 'value_encoder.pkl')

y = df_t['label_encoded']

#df_t.head()




In [66]:
from sklearn.neighbors import KNeighborsClassifier

# Train the model
model = KNeighborsClassifier(n_neighbors=1)
model.fit(X, y)

# Save the trained model
joblib.dump(model, 'trained_model.pkl')
print("Model training completed and saved.")


Model training completed and saved.


In [67]:
import joblib
import numpy as np

# Load the trained model and encoders
model = joblib.load('trained_model.pkl')
label_encoder = joblib.load('label_encoder.pkl')
value_encoder = joblib.load('value_encoder.pkl')

# Function to make predictions and handle unmatched values
def predict_labels_and_handle_unmatched(values_list, indices_list):
    matched_list = []
    non_matched_list = []
    matched_indices = []
    non_matched_indices = []

    for value, index in zip(values_list, indices_list):
        # Check if the value exists in the encoder categories
        if value in value_encoder.categories_[0]:
            value_array = np.array([value]).reshape(-1, 1)
            value_encoded = value_encoder.transform(value_array)
            prediction = model.predict(value_encoded)
            label = label_encoder.inverse_transform(prediction)[0]
            matched_list.append(label)
            matched_indices.append(index)
        else:
            non_matched_list.append(value)
            non_matched_indices.append(index)

    return matched_list, non_matched_list, matched_indices, non_matched_indices

# Example usage
values_list =val # ['USA', 155, 2546, 16545, 'vicjy', 'US']  # Replace with your list of values
indices_list = ind #[0, 1, 2, 3, 4, 5]  # Replace with corresponding indices

matched_list, non_matched_list, matched_indices, non_matched_indices = predict_labels_and_handle_unmatched(values_list, indices_list)

# Print the results
print("Matched List:", matched_list)
print("Non-Matched List:", non_matched_list)
print("Matched Indices:", matched_indices)
print("Non-Matched Indices:", non_matched_indices)


Matched List: ['Country code', 'Indi', 'yes or no ', 'yes or no ']
Non-Matched List: [515685321, 9.34, 20241111, 659455638, 54596, 20241020, 20243088, '2022010112:20', 202410101530]
Matched Indices: [0, 15, 18, 20]
Non-Matched Indices: [1, 2, 4, 6, 8, 9, 11, 13, 22]




In [68]:
a=non_matched_list
b=non_matched_indices
print(a)
print(b)

[515685321, 9.34, 20241111, 659455638, 54596, 20241020, 20243088, '2022010112:20', 202410101530]
[1, 2, 4, 6, 8, 9, 11, 13, 22]


In [40]:
c=matched_list
d=matched_indices
print(c)
print(d)

['Country code', 'Indi', 'yes or no ', 'yes or no ']
[0, 15, 18, 20]


Date prediction

In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import joblib
import re

# Load the dataset
file_path = '/content/date.csv'
df = pd.read_csv(file_path)

# Enhanced date format checking
def is_date_format(value):
    value = str(value)
    try:
        if len(value) == 8 and value.isdigit():
            pd.to_datetime(value, format='%Y%m%d', errors='raise')
            return True
        elif len(value) == 12 and value.isdigit():
            pd.to_datetime(value, format='%Y%m%d%H%M', errors='raise')
            return True
        elif len(value) == 13 and value[8:10] == ':' and value[:8].isdigit() and value[9:].isdigit():
            pd.to_datetime(value, format='%Y%m%d%H:%M', errors='raise')
            return True
    except ValueError:
        return False
    return False

def is_numeric(value):
    return value.isdigit()

def has_special_chars(value):
    return bool(re.search(r'[^\w\s]', value))

def is_alpha(value):
    return value.isalpha()

# Feature extraction function
def extract_features(values):
    values = values.astype(str)  # Ensure all values are strings
    features = pd.DataFrame()
    features['length'] = values.apply(len)
    features['is_date_format'] = values.apply(is_date_format)
    features['is_numeric'] = values.apply(is_numeric)
    features['has_special_chars'] = values.apply(has_special_chars)
    features['is_alpha'] = values.apply(is_alpha)
    return features

# Prepare features and labels
X_features = extract_features(df['value'])
y = df['label']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_features, y_encoded, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

# Save the model and encoder
joblib.dump(model, '/content/date_classifier_model.pkl')
joblib.dump(label_encoder, '/content/label_encoder.pkl')


              precision    recall  f1-score   support

           5       0.00      0.00      0.00       0.0
           8       0.00      0.00      0.00       1.0
           9       0.00      0.00      0.00       1.0
          11       0.00      0.00      0.00       1.0

    accuracy                           0.00       3.0
   macro avg       0.00      0.00      0.00       3.0
weighted avg       0.00      0.00      0.00       3.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


['/content/label_encoder.pkl']

In [42]:
import pandas as pd
import joblib
import re

# Load the model and encoder
model = joblib.load('/content/date_classifier_model.pkl')
label_encoder = joblib.load('/content/label_encoder.pkl')

# Define the feature extraction function (same as used in training)
def extract_features(values):
    values = values.astype(str)  # Ensure all values are strings
    features = pd.DataFrame()
    features['length'] = values.apply(len)
    features['is_date_format'] = values.apply(is_date_format)
    features['is_numeric'] = values.apply(is_numeric)
    features['has_special_chars'] = values.apply(has_special_chars)
    features['is_alpha'] = values.apply(is_alpha)
    return features

# Define function to check if a value is a date format
def is_date_format(value):
    value = str(value)
    if value == '':
        return False
    try:
        if len(value) == 8 and value.isdigit():
            pd.to_datetime(value, format='%Y%m%d', errors='raise')
            return True
        elif len(value) == 12 and value.isdigit():
            pd.to_datetime(value, format='%Y%m%d%H%M', errors='raise')
            return True
        elif len(value) == 13 and value[8:10] == ':' and value[:8].isdigit() and value[9:].isdigit():
            pd.to_datetime(value, format='%Y%m%d%H:%M', errors='raise')
            return True
    except ValueError:
        return False
    return False

def is_numeric(value):
    return value.isdigit()

def has_special_chars(value):
    return bool(re.search(r'[^\w\s]', value))

def is_alpha(value):
    return value.isalpha()

# New values to predict
input_values = a #['20241010', '202410101530', '20241010:15', 'AB458622', '659455560', '']  # Example input including empty value
input_indices = b #[0, 1, 2, 3, 4, 5]  # Corresponding indices

new_values = pd.Series(input_values)

# Initialize lists for matched and non-matched values and indices
matched_list = []
non_matched_list = []
matched_indices = []
non_matched_indices = []

# Iterate over values, predictions, and indices
for index, value in zip(input_indices, new_values):
    if value == '':
        non_matched_list.append(value)  # Add empty values to non-matched list
        non_matched_indices.append(index)
        continue

    # Extract features for non-empty values
    X_new_features = extract_features(pd.Series([value]))

    # Predict
    y_new_pred = model.predict(X_new_features)
    predicted_label = label_encoder.inverse_transform(y_new_pred)[0]

    if is_date_format(value):
        matched_list.append(predicted_label)
        matched_indices.append(index)
    else:
        non_matched_list.append(value)
        non_matched_indices.append(index)

# Print results
print(f"Matched List: {matched_list}")
print(f"Non-Matched List: {non_matched_list}")
print(f"Matched Indices: {matched_indices}")
print(f"Non-Matched Indices: {non_matched_indices}")


Matched List: ['date4', 'date4', 'date2']
Non-Matched List: [515685321, 2.96, 659455596, 54595, 20243025, '2022010112:14']
Matched Indices: [4, 9, 22]
Non-Matched Indices: [1, 2, 6, 8, 11, 13]


In [43]:
e=matched_list
f=matched_indices
print(e)
print(f)

['date4', 'date4', 'date2']
[4, 9, 22]


In [44]:
g=non_matched_list
h=non_matched_indices
print(g)
print(h)

[515685321, 2.96, 659455596, 54595, 20243025, '2022010112:14']
[1, 2, 6, 8, 11, 13]


In [45]:
str_li=[str(item) for item in g]
print(str_li)

['515685321', '2.96', '659455596', '54595', '20243025', '2022010112:14']


In [46]:
import pandas as pd
import re
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load training data from CSV
file_path = '/content/dataset_1.csv'
df = pd.read_csv(file_path)
#print("Loaded Data:")
#print(df)




# Feature extraction using .apply
df['length'] = df['value'].apply(len)
df['is_numeric_dm'] = df['value'].apply(lambda x: int(x.isdigit()))
df['is_alphabetic_dm'] = df['value'].apply(lambda x: int(x.isalpha()))
df['is_alphanumeric_dm'] = df['value'].apply(lambda x: int(x.isalnum()))
df['has_hyphen_dm'] = df['value'].apply(lambda x: int('-' in x))
df['has_letters_dm'] = df['value'].apply(lambda x: int(any(c.isalpha() for c in x)))
df['has_numbers_dm'] = df['value'].apply(lambda x: int(any(c.isdigit() for c in x)))
df['has_spaces_dm'] = df['value'].apply(lambda x: int(any(c.isspace() for c in x)))
df['has_special_chars_dm'] = df['value'].apply(lambda x: int(bool(re.search('[^a-zA-Z0-9\s]', x))))

df['is_numeric_re'] = df['value'].apply(lambda x: int(bool(re.match('^\d+$', x))))
df['is_alphabetic_re'] = df['value'].apply(lambda x: int(bool(re.match('^[a-zA-Z]+$', x))))
df['is_alphanumeric_re'] = df['value'].apply(lambda x: int(bool(re.match('^[a-zA-Z0-9]+$', x))))
df['has_hyphen_re'] = df['value'].apply(lambda x: int(bool(re.search('-', x))))
df['has_letters_re'] = df['value'].apply(lambda x: int(bool(re.search('[a-zA-Z]', x))))
df['has_numbers_re'] = df['value'].apply(lambda x: int(bool(re.search('[0-9]', x))))
df['has_spaces_re'] = df['value'].apply(lambda x: int(bool(re.search('\s', x))))
df['has_special_chars_re'] = df['value'].apply(lambda x: int(bool(re.search('[^a-zA-Z0-9\s]', x))))
df['is_exactly_3_digits'] = df['value'].apply(lambda x: int(bool(re.match('^\d{3}$', x))))

#print("Extracted Features:")
#print(df.head())

# Combine all features into a single DataFrame
combined_features = df[[
    'length',
    'is_numeric_dm', 'is_alphabetic_dm', 'is_alphanumeric_dm', 'has_hyphen_dm', 'has_letters_dm', 'has_numbers_dm', 'has_spaces_dm', 'has_special_chars_dm',
    'is_numeric_re', 'is_alphabetic_re', 'is_alphanumeric_re', 'has_hyphen_re', 'has_letters_re', 'has_numbers_re', 'has_spaces_re', 'has_special_chars_re',
    'is_exactly_3_digits'
]]

#print("Combined Features for Model Training:")
#print(combined_features)




# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(df['label'])
#print("Encoded Labels:")
#print(y_encoded)
#print("Label Distribution:")
#print(pd.Series(y_encoded).value_counts())

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, y_encoded, test_size=0.2, random_state=42)

# Convert to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((X_train.values, y_train)).batch(32)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test.values, y_test)).batch(32)


# Build a simple neural network model
model = Sequential([
    Dense(64, activation='relu', input_shape=(combined_features.shape[1],)),
    Dense(64, activation='relu'),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(train_dataset, epochs=10, validation_data=test_dataset)

import pickle
# Save the trained model and label encoder to local files
model.save('tf_model.h5')

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

print("Model and label encoder saved successfully.")

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.0000e+00 - loss: 2.0328 - val_accuracy: 0.5000 - val_loss: 1.3783
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.0000e+00 - loss: 1.9594 - val_accuracy: 0.5000 - val_loss: 1.4829
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.0000e+00 - loss: 1.8916 - val_accuracy: 0.5000 - val_loss: 1.5859
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.2500 - loss: 1.8295 - val_accuracy: 0.0000e+00 - val_loss: 1.6936
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.2500 - loss: 1.7718 - val_accuracy: 0.0000e+00 - val_loss: 1.8066
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.5000 - loss: 1.7182 - val_accuracy: 0.0000e+00 - val_loss: 1.9215
Epoch 7/10
[1m1/1[0m [32m━━━━━━━



Model and label encoder saved successfully.


In [60]:
str_li

['515685321', '2.96', '659455596', '54595', '20243025', '2022010112:14']

In [47]:
# Load the trained model and label encoder
model = tf.keras.models.load_model('tf_model.h5')

with open('label_encoder.pkl', 'rb') as f:
    le = pickle.load(f)

# Manually provided list of data for prediction
data_to_predict = str_li #['654321', 'B123456789', '5678123456789', '6789', '3D4F5H', '666-7890', '490', '400']

# Prepare features for prediction using .apply
predict_df = pd.DataFrame(data_to_predict, columns=['value'])

predict_df['length'] = predict_df['value'].apply(len)
predict_df['is_numeric_dm'] = predict_df['value'].apply(lambda x: int(x.isdigit()))
predict_df['is_alphabetic_dm'] = predict_df['value'].apply(lambda x: int(x.isalpha()))
predict_df['is_alphanumeric_dm'] = predict_df['value'].apply(lambda x: int(x.isalnum()))
predict_df['has_hyphen_dm'] = predict_df['value'].apply(lambda x: int('-' in x))
predict_df['has_letters_dm'] = predict_df['value'].apply(lambda x: int(any(c.isalpha() for c in x)))
predict_df['has_numbers_dm'] = predict_df['value'].apply(lambda x: int(any(c.isdigit() for c in x)))
predict_df['has_spaces_dm'] = predict_df['value'].apply(lambda x: int(any(c.isspace() for c in x)))
predict_df['has_special_chars_dm'] = predict_df['value'].apply(lambda x: int(bool(re.search('[^a-zA-Z0-9\s]', x))))

predict_df['is_numeric_re'] = predict_df['value'].apply(lambda x: int(bool(re.match('^\d+$', x))))
predict_df['is_alphabetic_re'] = predict_df['value'].apply(lambda x: int(bool(re.match('^[a-zA-Z]+$', x))))
predict_df['is_alphanumeric_re'] = predict_df['value'].apply(lambda x: int(bool(re.match('^[a-zA-Z0-9]+$', x))))
predict_df['has_hyphen_re'] = predict_df['value'].apply(lambda x: int(bool(re.search('-', x))))
predict_df['has_letters_re'] = predict_df['value'].apply(lambda x: int(bool(re.search('[a-zA-Z]', x))))
predict_df['has_numbers_re'] = predict_df['value'].apply(lambda x: int(bool(re.search('[0-9]', x))))
predict_df['has_spaces_re'] = predict_df['value'].apply(lambda x: int(bool(re.search('\s', x))))
predict_df['has_special_chars_re'] = predict_df['value'].apply(lambda x: int(bool(re.search('[^a-zA-Z0-9\s]', x))))
predict_df['is_exactly_3_digits'] = predict_df['value'].apply(lambda x: int(bool(re.match('^\d{3}$', x))))

# Select features for prediction
X_input = predict_df[[
    'length',
    'is_numeric_dm', 'is_alphabetic_dm', 'is_alphanumeric_dm', 'has_hyphen_dm', 'has_letters_dm', 'has_numbers_dm', 'has_spaces_dm', 'has_special_chars_dm',
    'is_numeric_re', 'is_alphabetic_re', 'is_alphanumeric_re', 'has_hyphen_re', 'has_letters_re', 'has_numbers_re', 'has_spaces_re', 'has_special_chars_re',
    'is_exactly_3_digits'
]]

# Make predictions with the TensorFlow model
rf_predictions = model.predict(X_input)
rf_predictions = tf.argmax(rf_predictions, axis=1).numpy()

# Decode label predictions
predicted_labels = le.inverse_transform(rf_predictions)
pred=[]
# Print prediction results
for value, label in zip(data_to_predict, predicted_labels):
    #print(f"Input Data: {value} -> Prediction: {label}")
    pred.append(label)
#pred



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step


In [48]:
print(pred)
print(h)

['tax', 'tax', 'tax', 'balance', 'tax', 'tax']
[1, 2, 6, 8, 11, 13]


In [54]:
# Given lists
values1 = pred # ['tax', 'tax', 'tax', 'balance', 'tax', 'tax']
indices1 = h #[1, 2, 6, 8, 11, 13]

values2 = e #['date4', 'date4', 'date2']
indices2 = f # [4, 9, 22]

values3 = c #['Country code', 'Indi', 'yes or no', 'yes or no']
indices3 = d #[0, 15, 18, 20]

values4 = ['empty'] * len(emp_ind)
indices4 = emp_ind #[3, 5, 7, 10, 12, 14, 16, 17, 19, 21]

# Combine all values and indices
all_values = values1 + values2 + values3 + values4
all_indices = indices1 + indices2 + indices3 + indices4

# Determine the length of the final list
max_index = max(all_indices)
final_list = [''] * (max_index + 1)

# Place the values in the final list at the respective indices
for value, index in zip(all_values, all_indices):
    final_list[index] = value

print(final_list)


TypeError: can only concatenate list (not "_io.BufferedReader") to list

In [59]:
"""# Given lists
pred = ['tax', 'tax', 'tax', 'balance', 'tax', 'tax']
h = [1, 2, 6, 8, 11, 13]

e = ['date4', 'date4', 'date2']
f = [4, 9, 22]

c = ['Country code', 'Indi', 'yes or no', 'yes or no']
d = [0, 15, 18, 20]
"""
emp_ind # = [3, 5, 7, 10, 12, 14, 16, 17, 19, 21]
values4 = ['empty'] * len(emp_ind)
indices4 = emp_ind

# Combine all values and indices
all_values = pred + e + c + values4
all_indices = h + f + d + indices4

# Determine the length of the final list
max_index = max(all_indices)
final_list = [''] * (max_index + 1)

# Place the values in the final list at the respective indices
for value, index in zip(all_values, all_indices):
    final_list[index] = value

print(final_list)


['Country code', 'tax', 'tax', 'empty', 'date4', 'empty', 'tax', 'empty', 'balance', 'date4', 'empty', 'tax', 'empty', 'tax', 'empty', 'Indi', 'empty', 'empty', 'yes or no', 'empty', 'yes or no', 'empty', 'date2']
