In [None]:
# Content for the data_dis.csv file
csv_content = """value,label
9586867657,cus_num
A9586867657,cus_num
E267845,country
1345654787468,s_num
4535,soi_id
07865,soi_id
1A2B3C,custom_code
555-1234,phone_number
"""

# Specify the file path where you want to save the CSV file
file_path = 'data_dis.csv'

# Write the content to the CSV file
with open(file_path, 'w') as f:
    f.write(csv_content)

print(f"CSV file '{file_path}' has been successfully created with the following content:\n\n{csv_content}")


CSV file 'data_dis.csv' has been successfully created with the following content:

value,label
9586867657,cus_num
A9586867657,cus_num
E267845,country
1345654787468,s_num
4535,soi_id
07865,soi_id
1A2B3C,custom_code
555-1234,phone_number



In [None]:
!pip install pandas scikit-learn



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import joblib

In [None]:
# Load data from CSV
df = pd.read_csv('data_dis.csv')
print("Data Loaded:")
print(df)

Data Loaded:
           value         label
0     9586867657       cus_num
1    A9586867657       cus_num
2        E267845       country
3  1345654787468         s_num
4           4535        soi_id
5          07865        soi_id
6         1A2B3C   custom_code
7       555-1234  phone_number


In [None]:
# Feature Engineering: Extract length and character type features
df['length'] = df['value'].apply(len)
df['is_numeric'] = df['value'].apply(lambda x: x.isdigit())
df['is_alphanumeric'] = df['value'].apply(lambda x: x.isalnum())
df['has_hyphen'] = df['value'].apply(lambda x: '-' in x)
print("\nFeature Engineering Completed:")
print(df)


Feature Engineering Completed:
           value         label  length  is_numeric  is_alphanumeric  \
0     9586867657       cus_num      10        True             True   
1    A9586867657       cus_num      11       False             True   
2        E267845       country       7       False             True   
3  1345654787468         s_num      13        True             True   
4           4535        soi_id       4        True             True   
5          07865        soi_id       5        True             True   
6         1A2B3C   custom_code       6       False             True   
7       555-1234  phone_number       8       False            False   

   has_hyphen  
0       False  
1       False  
2       False  
3       False  
4       False  
5       False  
6       False  
7        True  


In [None]:
# Encode labels
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])
print("\nLabels Encoded:")
print(df)


Labels Encoded:
           value         label  length  is_numeric  is_alphanumeric  \
0     9586867657       cus_num      10        True             True   
1    A9586867657       cus_num      11       False             True   
2        E267845       country       7       False             True   
3  1345654787468         s_num      13        True             True   
4           4535        soi_id       4        True             True   
5          07865        soi_id       5        True             True   
6         1A2B3C   custom_code       6       False             True   
7       555-1234  phone_number       8       False            False   

   has_hyphen  label_encoded  
0       False              1  
1       False              1  
2       False              0  
3       False              4  
4       False              5  
5       False              5  
6       False              2  
7        True              3  


In [None]:
# Features and target
X = df[['length', 'is_numeric', 'is_alphanumeric', 'has_hyphen']]
y = df['label_encoded']

In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\nData Split into Training and Testing Sets")
print(f"X_train:\n{X_train}\nX_test:\n{X_test}\ny_train:\n{y_train}\ny_test:\n{y_test}")


Data Split into Training and Testing Sets
X_train:
   length  is_numeric  is_alphanumeric  has_hyphen
0      10        True             True       False
7       8       False            False        True
2       7       False             True       False
4       4        True             True       False
3      13        True             True       False
6       6       False             True       False
X_test:
   length  is_numeric  is_alphanumeric  has_hyphen
1      11       False             True       False
5       5        True             True       False
y_train:
0    1
7    3
2    0
4    5
3    4
6    2
Name: label_encoded, dtype: int64
y_test:
1    1
5    5
Name: label_encoded, dtype: int64


In [None]:
# Train and Evaluate Random Forest Classifier
model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)
rf_predictions = model_rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("\nRandom Forest Classifier")
print(f"Predictions: {rf_predictions}")
print(f"Accuracy: {rf_accuracy}")


Random Forest Classifier
Predictions: [0 5]
Accuracy: 0.5


In [None]:
# Save the trained model
joblib.dump(model_rf, 'random_forest_model.pkl')

['random_forest_model.pkl']

In [None]:
# Prepare data for TF-IDF Vectorizer models
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['value'].astype(str))
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
print("\nTF-IDF Vectorizer Completed")
print(f"X_train_tfidf:\n{X_train_tfidf}\nX_test_tfidf:\n{X_test_tfidf}")

# Train and Evaluate Logistic Regression with TF-IDF
model_lr = LogisticRegression()
model_lr.fit(X_train_tfidf, y_train)
lr_predictions = model_lr.predict(X_test_tfidf)
lr_accuracy = accuracy_score(y_test, lr_predictions)
print("\nLogistic Regression with TF-IDF")
print(f"Predictions: {lr_predictions}")
print(f"Accuracy: {lr_accuracy}")


TF-IDF Vectorizer Completed
X_train_tfidf:
  (0, 6)	1.0
  (1, 1)	0.7071067811865475
  (1, 5)	0.7071067811865475
  (2, 8)	1.0
  (3, 4)	1.0
  (4, 2)	1.0
  (5, 3)	1.0
X_test_tfidf:
  (0, 7)	1.0
  (1, 0)	1.0

Logistic Regression with TF-IDF
Predictions: [2 2]
Accuracy: 0.0


In [None]:
# Save the trained model
joblib.dump(model_lr, 'logistic_regression_model.pkl')

In [None]:
# Train and Evaluate SVM with TF-IDF
model_svm = SVC()
model_svm.fit(X_train_tfidf, y_train)
svm_predictions = model_svm.predict(X_test_tfidf)
svm_accuracy = accuracy_score(y_test, svm_predictions)
print("\nSupport Vector Machine with TF-IDF")
print(f"Predictions: {svm_predictions}")
print(f"Accuracy: {svm_accuracy}")


Support Vector Machine with TF-IDF
Predictions: [3 5]
Accuracy: 0.5


In [None]:
# Compare the performance
print("\nModel Comparison")
print(f"Random Forest Accuracy: {rf_accuracy}")
print(f"Logistic Regression with TF-IDF Accuracy: {lr_accuracy}")
print(f"Support Vector Machine with TF-IDF Accuracy: {svm_accuracy}")


Model Comparison
Random Forest Accuracy: 0.5
Logistic Regression with TF-IDF Accuracy: 0.0
Support Vector Machine with TF-IDF Accuracy: 0.5


In [None]:
# Save the trained model
joblib.dump(model_svm, 'svm_model.pkl')

In [None]:
import pickle

# Train and save Random Forest Classifier
model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)
with open('model_rf.pkl', 'wb') as f:
    pickle.dump(model_rf, f)

# Train and save Logistic Regression with TF-IDF
model_lr = LogisticRegression()
model_lr.fit(X_train_tfidf, y_train)
with open('model_lr.pkl', 'wb') as f:
    pickle.dump(model_lr, f)

# Train and save SVM with TF-IDF
model_svm = SVC()
model_svm.fit(X_train_tfidf, y_train)
with open('model_svm.pkl', 'wb') as f:
    pickle.dump(model_svm, f)


Load Trained Models and Prepare Data

In [None]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder

# Load new data for prediction
new_data = pd.DataFrame({
    'value': ['123456', 'A9586867657', '1345654787468']
})

# Load models from pickle files
with open('model_rf.pkl', 'rb') as f:
    model_rf = pickle.load(f)

with open('model_lr.pkl', 'rb') as f:
    model_lr = pickle.load(f)

with open('model_svm.pkl', 'rb') as f:
    model_svm = pickle.load(f)

# Assuming label encoder was used during training, create a LabelEncoder instance and fit it with your labels
le = LabelEncoder()
labels = ['cus_num', 'country', 's_num', 'soi_id', 'custom_code', 'phone_number']
le.fit(labels)


In [None]:
#Random fores#
# Feature Engineering: Extract features from new data
new_data['length'] = new_data['value'].apply(len)
new_data['is_numeric'] = new_data['value'].apply(lambda x: x.isdigit())
new_data['is_alphanumeric'] = new_data['value'].apply(lambda x: x.isalnum())
new_data['has_hyphen'] = new_data['value'].apply(lambda x: '-' in x)

# Prepare features for prediction
X_new_rf = new_data[['length', 'is_numeric', 'is_alphanumeric', 'has_hyphen']]

# Make predictions with Random Forest model
rf_predictions = model_rf.predict(X_new_rf)
predicted_labels_rf = le.inverse_transform(rf_predictions)

# Output predictions
for value, pred_rf in zip(new_data['value'], predicted_labels_rf):
    print(f"Value: {value}")
    print(f"Random Forest Prediction: {pred_rf}")
    print()


Value: 123456
Random Forest Prediction: soi_id

Value: A9586867657
Random Forest Prediction: country

Value: 1345654787468
Random Forest Prediction: s_num



In [None]:
# Example if you have TF-IDF model saved
# Load TF-IDF Vectorizer (assuming it's already fitted)
with open('model_lr.pkl', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)

# Transform new data using fitted TF-IDF Vectorizer
X_new_tfidf = tfidf_vectorizer.transform(new_data['value'].astype(str))

# Make predictions with Logistic Regression model
lr_predictions = model_lr.predict(X_new_tfidf)
predicted_labels_lr = le.inverse_transform(lr_predictions)

# Output predictions
for value, pred_lr in zip(new_data['value'], predicted_labels_lr):
    print(f"Value: {value}")
    print(f"Logistic Regression Prediction: {pred_lr}")
    print()


AttributeError: 'LogisticRegression' object has no attribute 'transform'

In [None]:
# Example if you have TF-IDF model saved
# Transform new data using fitted TF-IDF Vectorizer
X_new_tfidf = tfidf_vectorizer.transform(new_data['value'].astype(str))

# Make predictions with SVM model
svm_predictions = model_svm.predict(X_new_tfidf)
predicted_labels_svm = le.inverse_transform(svm_predictions)

# Output predictions
for value, pred_svm in zip(new_data['value'], predicted_labels_svm):
    print(f"Value: {value}")
    print(f"SVM Prediction: {pred_svm}")
    print()


AttributeError: 'LogisticRegression' object has no attribute 'transform'

In [None]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

# Load new data for prediction
new_data = pd.DataFrame({
    'value': ['123456', 'A9586867657', '1345654787468']
})

# Load TF-IDF Vectorizer (assuming it's already fitted during training)
with open('model_lr.pkl', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)

# Transform new data using fitted TF-IDF Vectorizer
X_new_tfidf = tfidf_vectorizer.transform(new_data['value'].astype(str))

# Load Logistic Regression model
with open('model_lr.pkl', 'rb') as f:
    model_lr = pickle.load(f)

# Make predictions with Logistic Regression model
lr_predictions = model_lr.predict(X_new_tfidf)

# Assuming label encoder was used during training, create a LabelEncoder instance and fit it with your labels
le = LabelEncoder()
labels = ['cus_num', 'country', 's_num', 'soi_id', 'custom_code', 'phone_number']
le.fit(labels)

# Decode label predictions
predicted_labels_lr = le.inverse_transform(lr_predictions)

# Output predictions
for value, pred_lr in zip(new_data['value'], predicted_labels_lr):
    print(f"Value: {value}")
    print(f"Logistic Regression Prediction: {pred_lr}")
    print()


AttributeError: 'LogisticRegression' object has no attribute 'transform'

In [None]:
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Load CSV file
file_path = r'/content/input.xlsx'
df = pd.read_csv(file_path)

# Assuming the CSV file has a header row and the second row contains the data for prediction
input_data = df.iloc[1]['value']  # Assuming 'value' is the column name containing text data for prediction
#input_data

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x82 in position 16: invalid start byte

In [None]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder

# Load new data for prediction
new_data = pd.DataFrame({
    'value': ['123456', 'A9586867658', '1234567893456']
})

# Load models from pickle files
with open('model_rf.pkl', 'rb') as f:
    model_rf = pickle.load(f)

with open('model_lr.pkl', 'rb') as f:
    model_lr = pickle.load(f)

with open('model_svm.pkl', 'rb') as f:
    model_svm = pickle.load(f)

# Assuming label encoder was used during training, create a LabelEncoder instance and fit it with your labels
le = LabelEncoder()
labels = ['cus_num', 'country', 's_num', 'soi_id', 'custom_code', 'phone_number']
le.fit(labels)


#Random fores#
# Feature Engineering: Extract features from new data
new_data['length'] = new_data['value'].apply(len)
new_data['is_numeric'] = new_data['value'].apply(lambda x: x.isdigit())
new_data['is_alphanumeric'] = new_data['value'].apply(lambda x: x.isalnum())
new_data['has_hyphen'] = new_data['value'].apply(lambda x: '-' in x)

# Prepare features for prediction
X_new_rf = new_data[['length', 'is_numeric', 'is_alphanumeric', 'has_hyphen']]

# Make predictions with Random Forest model
rf_predictions = model_rf.predict(X_new_rf)
predicted_labels_rf = le.inverse_transform(rf_predictions)

# Output predictions
for value, pred_rf in zip(new_data['value'], predicted_labels_rf):
    print(f"Value: {value}")
    print(f"Random Forest Prediction: {pred_rf}")
    print()


Value: 123456
Random Forest Prediction: soi_id

Value: A9586867658
Random Forest Prediction: country

Value: 1234567893456
Random Forest Prediction: s_num



In [21]:
import pandas as pd

# Load training data from CSV
file_path = 'data_dis.csv'
df = pd.read_csv(file_path)
print("Loaded Data:")
print(df)


Loaded Data:
           value         label
0     9586867657       cus_num
1    A9586867657       cus_num
2        E267845       country
3  1345654787468         s_num
4           4535        soi_id
5          07865        soi_id
6         1A2B3C   custom_code
7       555-1234  phone_number


In [22]:
import re

# Feature engineering using direct methods and regular expressions
def extract_features(value):
    # Direct method feature extraction
    length = len(value)
    is_numeric_dm = int(value.isdigit())
    is_alphabetic_dm = int(value.isalpha())
    is_alphanumeric_dm = int(value.isalnum())
    has_hyphen_dm = int('-' in value)
    has_letters_dm = int(any(c.isalpha() for c in value))
    has_numbers_dm = int(any(c.isdigit() for c in value))
    has_spaces_dm = int(any(c.isspace() for c in value))
    has_special_chars_dm = int(bool(re.search('[^a-zA-Z0-9\s]', value)))

    # Regular expression feature extraction
    is_numeric_re = int(bool(re.match('^\d+$', value)))
    is_alphabetic_re = int(bool(re.match('^[a-zA-Z]+$', value)))
    is_alphanumeric_re = int(bool(re.match('^[a-zA-Z0-9]+$', value)))
    has_hyphen_re = int(bool(re.search('-', value)))
    has_letters_re = int(bool(re.search('[a-zA-Z]', value)))
    has_numbers_re = int(bool(re.search('[0-9]', value)))
    has_spaces_re = int(bool(re.search('\s', value)))
    has_special_chars_re = int(bool(re.search('[^a-zA-Z0-9\s]', value)))

    return [
        length,
        is_numeric_dm, is_alphabetic_dm, is_alphanumeric_dm, has_hyphen_dm, has_letters_dm, has_numbers_dm, has_spaces_dm, has_special_chars_dm,
        is_numeric_re, is_alphabetic_re, is_alphanumeric_re, has_hyphen_re, has_letters_re, has_numbers_re, has_spaces_re, has_special_chars_re
    ]

# Apply feature extraction to the data
features = df['value'].apply(extract_features).tolist()

# Create a DataFrame for the features
feature_columns = [
    'length',
    'is_numeric_dm', 'is_alphabetic_dm', 'is_alphanumeric_dm', 'has_hyphen_dm', 'has_letters_dm', 'has_numbers_dm', 'has_spaces_dm', 'has_special_chars_dm',
    'is_numeric_re', 'is_alphabetic_re', 'is_alphanumeric_re', 'has_hyphen_re', 'has_letters_re', 'has_numbers_re', 'has_spaces_re', 'has_special_chars_re'
]

features_df = pd.DataFrame(features, columns=feature_columns)
print("Extracted Features:")
print(features_df)


Extracted Features:
   length  is_numeric_dm  is_alphabetic_dm  is_alphanumeric_dm  has_hyphen_dm  \
0      10              1                 0                   1              0   
1      11              0                 0                   1              0   
2       7              0                 0                   1              0   
3      13              1                 0                   1              0   
4       4              1                 0                   1              0   
5       5              1                 0                   1              0   
6       6              0                 0                   1              0   
7       8              0                 0                   0              1   

   has_letters_dm  has_numbers_dm  has_spaces_dm  has_special_chars_dm  \
0               0               1              0                     0   
1               1               1              0                     0   
2               1           

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pickle

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(df['label'])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_df, y_encoded, test_size=0.2, random_state=42)

# Train Random Forest model
model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)

# Save the trained model and label encoder to local files
with open('model_rf.pkl', 'wb') as f:
    pickle.dump(model_rf, f)

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

print("Model and label encoder saved successfully.")


Model and label encoder saved successfully.


In [24]:
# Manually provided list of data for prediction
data_to_predict = ['654321', 'B123456789', '5678123456789', '6789', '3D4F5H', '666-7890', '490', '400']

# Prepare features for prediction
X_input = [extract_features(value) for value in data_to_predict]

# Convert to DataFrame
X_input_df = pd.DataFrame(X_input, columns=feature_columns)

# Load Random Forest model
with open('model_rf.pkl', 'rb') as f:
    model_rf = pickle.load(f)

# Load label encoder
with open('label_encoder.pkl', 'rb') as f:
    le = pickle.load(f)

# Make predictions with Random Forest model
rf_predictions = model_rf.predict(X_input_df)

# Decode label predictions
predicted_labels = le.inverse_transform(rf_predictions)

# Print prediction results
for value, label in zip(data_to_predict, predicted_labels):
    print(f"Input Data: {value} -> Prediction: {label}")


Input Data: 654321 -> Prediction: soi_id
Input Data: B123456789 -> Prediction: country
Input Data: 5678123456789 -> Prediction: s_num
Input Data: 6789 -> Prediction: soi_id
Input Data: 3D4F5H -> Prediction: custom_code
Input Data: 666-7890 -> Prediction: phone_number
Input Data: 490 -> Prediction: soi_id
Input Data: 400 -> Prediction: soi_id
